// Function to read in kernel from uncompiled source, create the OCL program and build the OCL program // ************************************************************************************************** int CreateProgramAndKernel(cl_context cxGPUContext, cl_device_id* cdDevices, const char *kernel_name, cl_kernel *kernel, bool bDouble) { cl_program cpProgram; size_t szSourceLen; cl_int ciErrNum = CL_SUCCESS; // Read the kernel in from file shrLog("\nLoading Uncompiled kernel from .cl file, using %s\n", clSourcefile); char* cPathAndFile = shrFindFilePath(clSourcefile, cExecutablePath); oclCheckError(cPathAndFile != NULL, shrTRUE); char* pcSource = oclLoadProgSource(cPathAndFile, "", &szSourceLen); oclCheckError(pcSource != NULL, shrTRUE); // Check OpenCL version -> vec3 types are supported only from version 1.1 and above char cOCLVersion[32]; clGetDeviceInfo(cdDevices[0], CL_DEVICE_VERSION, sizeof(cOCLVersion), &cOCLVersion, 0); int iVec3Length = 3; if( strncmp("OpenCL 1.0", cOCLVersion, 10) == 0 ) { iVec3Length = 4; } //for double precision char *pcSourceForDouble; std::stringstream header; if (bDouble) { header << "#define REAL double"; header << std::endl; header << "#define REAL4 double4"; header << std::endl; header << "#define REAL3 double" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0, 0.0, 0.0" << ((iVec3Length == 4) ? ", 0.0}" : "}"); header << std::endl; } else { header << "#define REAL float"; header << std::endl; header << "#define REAL4 float4"; header << std::endl; header << "#define REAL3 float" << iVec3Length; header << std::endl; header << "#define ZERO3 {0.0f, 0.0f, 0.0f" << ((iVec3Length == 4) ? ", 0.0f}" : "}"); header << std::endl; } header << pcSource; pcSourceForDouble = (char *)malloc(header.str().size() + 1); szSourceLen = header.str().size(); #ifdef WIN32 strcpy_s(pcSourceForDouble, szSourceLen + 1, header.str().c_str()); #else strcpy(pcSourceForDouble, header.str().c_str()); #endif // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&pcSourceForDouble, &szSourceLen, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateProgramWithSource\n"); // Build the program with 'mad' Optimization option #ifdef MAC char *flags = "-cl-fast-relaxed-math -DMAC"; #else char *flags = "-cl-fast-relaxed-math"; #endif ciErrNum = clBuildProgram(cpProgram, 0, NULL, flags, NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclNbody.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } shrLog("clBuildProgram\n"); // create the kernel *kernel = clCreateKernel(cpProgram, kernel_name, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("clCreateKernel\n"); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(*kernel, cdDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) { shrLog( "ERROR: Minimum work-group size 256 required by this application is not supported on this device.\n"); exit(0); } free(pcSourceForDouble); return 0; }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { int patchbfi = 0; cl_int status = 0; unsigned int i; _clState *clState = calloc(1, sizeof(_clState)); cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)"); return NULL; } if (numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)"); return NULL; } for(i = 0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)"); free(platforms); return NULL; } platform = platforms[i]; if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } size_t nDevices; cl_uint numDevices; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (num)"); return NULL; } cl_device_id *devices; if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device IDs (list)"); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for(i=0; i<numDevices; i++) { char pbuff[100]; status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { char pbuff[100]; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, &nDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Getting Device Info"); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS"); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = patchbfi = 1; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line */ if (clState->preferred_vwidth > 1) clState->preferred_vwidth = 2; if (opt_vectors) clState->preferred_vwidth = opt_vectors; if (opt_worksize && opt_worksize <= clState->max_work_size) clState->work_size = opt_worksize; else clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->preferred_vwidth; /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin */ char binaryfilename[255]; char numbuf[10]; char filename[16]; if (chosen_kernel == KL_NONE) { if (clState->hasBitAlign) chosen_kernel = KL_PHATK; else chosen_kernel = KL_POCLBM; } switch (chosen_kernel) { case KL_POCLBM: strcpy(filename, "poclbm110817.cl"); strcpy(binaryfilename, "poclbm110817"); break; case KL_NONE: /* Shouldn't happen */ case KL_PHATK: strcpy(filename, "phatk110817.cl"); strcpy(binaryfilename, "phatk110817"); break; } FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source, *rawsource = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; if (!rawsource) return NULL; source = malloc(pl); if (!source) { applog(LOG_ERR, "Unable to malloc source"); return NULL; } binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to malloc binary_sizes"); return NULL; } binaries = (char **)malloc(sizeof(char *)*nDevices); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to malloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->hasBitAlign) strcat(binaryfilename, "bitalign"); strcat(binaryfilename, "v"); sprintf(numbuf, "%d", clState->preferred_vwidth); strcat(binaryfilename, numbuf); strcat(binaryfilename, "w"); sprintf(numbuf, "%d", (int)clState->work_size); strcat(binaryfilename, numbuf); strcat(binaryfilename, "long"); sprintf(numbuf, "%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { if (opt_debug) applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { if (opt_debug) applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } binary_sizes[gpu] = binary_stat.st_size; binaries[gpu] = (char *)malloc(binary_sizes[gpu]); if (unlikely(!binaries[gpu])) { applog(LOG_ERR, "Unable to malloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) { applog(LOG_ERR, "Unable to fread binaries[gpu]"); fclose(binaryfile); goto build; } fclose(binaryfile); clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); return NULL; } if (opt_debug) applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); free(binaries[gpu]); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: memcpy(source, rawsource, pl); /* Patch the source file with the preferred_vwidth */ if (clState->preferred_vwidth > 1) { char *find = strstr(source, "VECTORSX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find VECTORSX in source"); return NULL; } find += 7; // "VECTORS" if (clState->preferred_vwidth == 2) strncpy(find, "2", 1); else strncpy(find, "4", 1); if (opt_debug) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth); } /* Patch the source file defining BITALIGN */ if (clState->hasBitAlign) { char *find = strstr(source, "BITALIGNX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find BITALIGNX in source"); return NULL; } find += 8; // "BITALIGN" strncpy(find, " ", 1); if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BITALIGN"); } else if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BITALIGN patch"); if (patchbfi) { char *find = strstr(source, "BFI_INTX"); if (unlikely(!find)) { applog(LOG_ERR, "Unable to find BFI_INTX in source"); return NULL; } find += 7; // "BFI_INT" strncpy(find, " ", 1); if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BFI_INT"); } else if (opt_debug) applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BFI_INT patch"); clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } /* create a cl program executable for all the devices specified */ char CompilerOptions[256]; sprintf(CompilerOptions, "%s%i", "-DWORKSIZE=", (int)clState->work_size); //int n = 1000; //while(n--) // printf("%s", CompilerOptions); //return 1; status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetPlatformInfo)"); return NULL; } /* copy over all of the generated binaries. */ if (opt_debug) applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]); if (!binary_sizes[gpu]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, may need to reboot!"); return NULL; } binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu]); status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)"); return NULL; } /* Patch the kernel if the hardware supports BFI_INT */ if (patchbfi) { unsigned remaining = binary_sizes[gpu]; char *w = binaries[gpu]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[gpu]; remaining = binary_sizes[gpu]; if (!advance(&w, &remaining, "ELF")) {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; if (opt_debug) applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)"); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); return NULL; } clRetainProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)"); return NULL; } } free(source); free(rawsource); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ if (opt_debug) applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } if (binaries[gpu]) free(binaries[gpu]); built: free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d", filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size); /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_INFO, "%s", log); return NULL; } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)"); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)"); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)"); return NULL; } return clState; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'logb_float.cl' */ source_code = read_buffer("logb_float.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "logb_float", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float)(2.0); /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
void OpenCLExecuter::ocl_filter_shared(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer cl_sampler sampler; // OpenCL sampler cl_kernel ckKernel; // OpenCL kernel int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process // set Local work size dimensions // size_t local_threads[3] ={256,256,64}; // set Global work size dimensions // size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]}; // set Global work size dimensions size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err); printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // create a sampler object sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err); printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // grab input data from OpenGL, compute, copy the results back to OpenGL // Runs asynchronous to host, up until blocking clFinish at the end glFinish(); glFlush(); // grab the OpenGL texture object for read/writing from OpenCL err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Execute a kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); /* // Blocking read of results from GPU to Host int size = volobj->texwidth*volobj->texheight*volobj->texdepth; unsigned char* result = new unsigned char[size]; err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i]; delete[] result; */ // copy OpenCL buffer to OpenGl texture size_t corigin[3] = {0,0,0}; size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth}; err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL); printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err)); //make sure we block until we are done. //err = clFinish(ocl_wrapper->commandQue); //printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err)); //release opengl objects now err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(src_buffer)clReleaseMemObject(src_buffer); if(dst_buffer)clReleaseMemObject(dst_buffer); }
void OpenCLExecuter::ocl_parrallelReduction(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem tmp_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group size_t numWorkGroups; cl_kernel ckKernel; // OpenCL kernel int iNumElements = 65536; //65536 // Length of float arrays to process // set Local work size dimensions szLocalWorkSize = 512; // set Global work size dimensions szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize; //szGlobalWorkSize = iNumElements; numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize; printf("OPENCL: number of elements: %d\n", (int)iNumElements); printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize); printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize); printf("OPENCL: work groups: %d\n", (int)(numWorkGroups)); //temp array int* data = new int[iNumElements]; for(int i=0; i<iNumElements; i++) data[i] = randomFloat(1.0, (float)iNumElements); data[iNumElements/2] = -100.0; //for(int i=0; i<iNumElements; i++) // printf("data: %d\n", data[i]); size_t global_threads[1] ={iNumElements}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the temp buffer memory object tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "min_reduce", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end int numb_iterations = sqrt((float)numWorkGroups); numb_iterations=0; bool cont = true; Timer timer; timer.startTimer(); //for(int i=0; i<numb_iterations; i++) while(cont) { // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); numb_iterations++; if(data[1]==0) cont = false; //printf("min: %d\n", data[0]); for(int i=0; i<numWorkGroups; i++) printf("min: %d\n", data[i]); } timer.endTimer("GPU find min"); timer.startTimer(); int min=iNumElements; for(int i=0; i<iNumElements; i++) if(data[i]<min) min = data[i]; timer.endTimer("CPU find min"); printf("iters: %d\n", numb_iterations); printf("gpu-min: %d\n", data[0]); printf("cpu-min: %d\n", min); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); // printf("min: %d\n", data[0]); delete[] data; }
void OpenCLExecuter::ocl_filter_multi(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer[MAX_DEVICES]; // OpenCL device source buffer cl_mem dst_buffer[MAX_DEVICES]; // OpenCL device source buffer cl_command_queue queues[MAX_DEVICES]; // OpenCL device queue cl_kernel ckKernel[MAX_DEVICES]; // OpenCL kernel cl_event gpuDone[MAX_DEVICES]; // int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth*3; // Length of float arrays to process int xdim, ydim, zdim; xdim = (float)volobj->texwidth; // (float)ocl_wrapper->numDevices; ydim = (float)volobj->texheight; // (float)ocl_wrapper->numDevices; zdim = (float)volobj->texdepth / (float)ocl_wrapper->numDevices; //Length of array to process int iNumElements = (xdim*ydim*zdim); size_t global_threads[3] = {xdim, ydim, zdim}; //temp array unsigned char** data = new unsigned char*[ocl_wrapper->numDevices]; for(int i=0; i<ocl_wrapper->numDevices; i++) data[i] = new unsigned char[iNumElements]; for(int i=0; i<ocl_wrapper->numDevices; i++) { printf("OPENCL: Computing Device%d\n", i); //create the command queue we will use to execute OpenCL commands queues[i] = clCreateCommandQueue(ocl_wrapper->context, ocl_wrapper->devices[i], 0, &err); printf("OPENCL: clCreateCommandQueue: %s\n", ocl_wrapper->get_error(err)); // allocate the source buffer memory object src_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel[i] = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel[i], 0, sizeof(cl_mem), (void*)&src_buffer[i]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 1, sizeof(cl_mem), (void*)&dst_buffer[i]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 2, sizeof(int), (void*)&global_threads[0]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 3, sizeof(int), (void*)&global_threads[1]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 4, sizeof(int), (void*)&global_threads[2]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); //Prepare data to upload int iOffsetElements = (xdim*ydim*zdim*i); for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++) data[i][j-iOffsetElements] = volobj->texture3d[3*j+0]; // Write data from host to GPU err = clEnqueueWriteBuffer (queues[i], src_buffer[i], CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Launch kernel err = clEnqueueNDRangeKernel (queues[i], ckKernel[i], 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (queues[i], dst_buffer[i], CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, &gpuDone[i]); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); } // Synchronize with the GPUs printf("OPENCL: Waiting for devices to sync\n"); clWaitForEvents(ocl_wrapper->numDevices, gpuDone); for(int i=0; i<ocl_wrapper->numDevices; i++) { //read data back int iOffsetElements = (xdim*ydim*zdim*i); for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++) volobj->texture3d[3*j+0] = data[i][j-iOffsetElements]; } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel[i])clReleaseKernel(ckKernel[i]); //need to release any other OpenCL memory objects here if(dst_buffer[i])clReleaseMemObject(dst_buffer[i]); if(src_buffer[i])clReleaseMemObject(src_buffer[i]); } for(int i=0; i<ocl_wrapper->numDevices; i++) delete[] data[i]; delete[] data; }
void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem bbmin_buffer; // OpenCL device source buffer cl_mem bbmax_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel cl_int4 minbb; cl_int4 maxbb; minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192; maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192; int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&channel); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(src_buffer)clReleaseMemObject(src_buffer); if(bbmin_buffer)clReleaseMemObject(bbmin_buffer); if(bbmax_buffer)clReleaseMemObject(bbmax_buffer); maxbb.s[0] += (float)window_size/2.0; maxbb.s[1] += (float)window_size/2.0; maxbb.s[2] += (float)window_size/2.0; minbb.s[0] -= (float)window_size/2.0; minbb.s[1] -= (float)window_size/2.0; minbb.s[2] -= (float)window_size/2.0; maxbb.s[0] += 2; maxbb.s[1] += 2; maxbb.s[2] += 2; minbb.s[0] -= 2; minbb.s[1] -= 2; minbb.s[2] -= 2; if(maxbb.s[0]>volobj->texwidth-1) maxbb.s[0] =volobj->texwidth-1; if(maxbb.s[1]>volobj->texheight-1) maxbb.s[1] =volobj->texheight-1; if(maxbb.s[2]>volobj->texdepth-1) maxbb.s[2] =volobj->texdepth-1; if(minbb.s[0]<0) minbb.s[0]=0; if(minbb.s[1]<0) minbb.s[1]=0; if(minbb.s[2]<0) minbb.s[2]=0; volobj->boundingboxSize.x = ((maxbb.s[0])-(minbb.s[0]-1)); volobj->boundingboxSize.y = ((maxbb.s[1])-(minbb.s[1]-1)); volobj->boundingboxSize.z = ((maxbb.s[2])-(minbb.s[2]-1)); volobj->boundingboxCentre.x = 0.0; //-(((float)boundingboxSize.x)/2.0); volobj->boundingboxCentre.y = 0.0; //-(((float)boundingboxSize.y)/2.0); volobj->boundingboxCentre.z = 0.0; //-(((float)boundingboxSize.z)/2.0); volobj->boundingboxMin = Vector(minbb.s[0], minbb.s[1], minbb.s[2]); volobj->boundingboxMax = Vector(maxbb.s[0], maxbb.s[1], maxbb.s[2]); printf("min: %f, %f, %f\n", volobj->boundingboxMin.x, volobj->boundingboxMin.y, volobj->boundingboxMin.z); printf("max: %f, %f, %f\n", volobj->boundingboxMax.x, volobj->boundingboxMax.y, volobj->boundingboxMax.z); }
int main(void) { // se crea los 2 vectores de entrada int i; const int LIST_SIZE = 1024; int *A = (int*)malloc(sizeof(int)*LIST_SIZE); int *B = (int*)malloc(sizeof(int)*LIST_SIZE); for(i = 0; i < LIST_SIZE; i++) { A[i] = i; B[i] = LIST_SIZE - i; } // cargamos el kernel en source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("vector_add_kernel.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // obtenemos las plataformas y informacion de los devices cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // creamos un contexto OpenCL cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); // creamos la cola de comandos cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // creamos el buffer de memoria en el device para cada vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); // copiamos los vectores A y B a sus respectivas memorias buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); // creamos un programa para el kernel cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // generamos el programa ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // creamos el kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); // establecemos los argumentos del kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); // ejecutamos el kernel de la lista size_t global_item_size = LIST_SIZE; size_t local_item_size = 64; // dividimos los work items en grupos de 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // copiamos la memoria buffer C del device hacia la variable local C int *C = (int*)malloc(sizeof(int)*LIST_SIZE); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); // muestra el resultado for(i = 0; i < LIST_SIZE; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); free(A); free(B); free(C); return 0; }
int main(int argc, char **argv) { printf("start \n"); int x, y, nsteps, i, j; float *u_h; double *f_h; //pointers to host memory int ArraySizeX = 5122; int ArraySizeY = 5122; double n, ux, uy, uxx, uxy, uyy, usq; FILE *fp; size_t size = ArraySizeX*ArraySizeY*sizeof(float); size_t size1 = ArraySizeX*ArraySizeY*9*sizeof(double); u_h = (float *)calloc(ArraySizeX*ArraySizeY,sizeof(float)); f_h = (double *)calloc(ArraySizeX*ArraySizeY*9,sizeof(double)); printf("initialization \n"); // initialization for( x = 0;x<ArraySizeX;x++){ for( y =0;y<ArraySizeY;y++){ // define the macroscopic properties of the initial condition. n = 1 + Amp2*exp(-(pow(x-ArraySizeX/2,2)+pow(y-ArraySizeY/2,2))/Width); ux = 0; uy = 0; // intialize f to be the local equilibrium values uxx = ux*ux; uyy = uy*uy; uxy = 2*ux*uy; usq = uxx+ uyy; f_h[x*ArraySizeY*9+y*9] = w1*n*(1-1.5*usq); f_h[x*ArraySizeY*9+y*9+1] = w2*n*(1+3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+2] = w2*n*(1-3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+3] = w2*n*(1+3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+4]= w2*n*(1-3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+5] = w3*n*(1+3*(ux+uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+6] = w3*n*(1+3*(-ux+uy)+4.5*(uxx-uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+7] = w3*n*(1+3*(-ux-uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+8] = w3*n*(1+3*(ux-uy)+4.5*(uxx-uxy+uyy)-1.5*usq); } } cl_event event; cl_ulong time_start, time_end, total_time; // use this to check the output of each API call cl_int status; // retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0,NULL,&numPlatforms); chk(status, "clGetPlatformIDs0"); // allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id *) malloc(numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); chk(status, "clGetPlatformIDs1"); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); chk(status, "clGetDeviceIDs0"); // Allocate enough space for each device cl_device_id *devices = NULL; devices = (cl_device_id *) malloc(numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); chk(status, "clGetDeviceIDs1"); // Create a context and associate it with devices cl_context context; context = clCreateContext(NULL,numDevices, devices, NULL, NULL, &status); chk(status,"clCreateContext"); // Create a command queue and associate it with device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE,&status); chk(status,"clCreateCommandQueue"); // Create Buffer objects on devices cl_mem u_d, f_d; u_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); chk(status,"clCreatebuffer"); f_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size1, NULL, &status); chk(status, "clCreatebuffer"); // perform computing on GPU // copy data from host to device status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL); chk(status,"ClEnqueueWriteBuffer"); status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); // create program with source code cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status); chk(status, "clCreateProgramWithSource"); // Compile program for the device status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL); // chk(status, "ClBuildProgram"); if(status != CL_SUCCESS){ printf("clBuildProgram failed (%d) \n", status); size_t log_size; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *) malloc(log_size); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); printf("%s\n", log); exit(-1); } printf("successfully built program \n"); // Create lattice-boltzman kernel cl_kernel kernel, kernel1; kernel = clCreateKernel(program, "lbiteration", &status); kernel1 = clCreateKernel(program, "Denrho", &status); chk(status, "clCreateKernel"); printf("successfully create kernel \n"); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d); status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY); status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY); chk(status, "clSerKernelArg"); // set the work dimensions size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y}; int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2); int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2); size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y}; // loop the kernel for( nsteps = 0; nsteps < 100; nsteps++){ status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; } printf("Good so far \n"); status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event); chk(status, "clEnqueueNDR"); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("running time is %0.3f \n",(total_time/1000000000.0)); // retrieve data from device status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Output results fp = fopen("SolutionCL.txt", "wt"); for(i= 0;i<ArraySizeX;i++){ for(j=0;j<ArraySizeY;j++) fprintf(fp, " %f", u_h[i*ArraySizeY+j]); fprintf(fp, "\n"); } fclose(fp); //cleanup clReleaseKernel(kernel); clReleaseKernel(kernel1); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(u_d); clReleaseMemObject(f_d); clReleaseContext(context); free(u_h); free(f_h); free(platforms); free(devices); return 0; }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- void setup_opencl(int argc, char *argv[]) { cl_int err_code; char *source_dir = "EP"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code); clu_CheckError(err_code, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &err_code); clu_CheckError(err_code, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; sprintf(build_option, "-DM=%d -I.", M); if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ep_cpu.cl"; GROUP_SIZE = 16; } else { source_file = "ep_gpu.cl"; GROUP_SIZE = 64; } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); gq_size = np / GROUP_SIZE * NQ * sizeof(double); gsx_size = np / GROUP_SIZE * sizeof(double); gsy_size = np / GROUP_SIZE * sizeof(double); pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgq"); pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsx"); pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsy"); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create a kernel DTIMER_START(T_OPENCL_API); kernel = clCreateKernel(program, "embar", &err_code); clu_CheckError(err_code, "clCreateKernel()"); DTIMER_STOP(T_OPENCL_API); }
int main(int argc, char** argv) { srand(1000); int i; unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_event mm; size_t dataBytes; size_t kernelLength; cl_int errcode; cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); FILE* fp = fopen("hw2.cl", "r"); fseek (fp , 0 , SEEK_END); const size_t lSize = ftell(fp); rewind(fp); unsigned char* buffer; buffer = (unsigned char*) malloc (lSize); fread(buffer, 1, lSize, fp); fclose(fp); cl_int status; clProgram = clCreateProgramWithBinary(clGPUContext, 1, (const cl_device_id *)clDevices, &lSize, (const unsigned char**)&buffer, &status, &errcode); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); clKernel = clCreateKernel(clProgram, "MM", &errcode); size_t globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); globalWorkSize[0] = 16; globalWorkSize[1] = 16; cl_ulong time_start, time_end, total_time = 0; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &mm); printf("Average time = %lu\n"); clFinish(clCommandQue); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("Average time = %lu\n", total_time); errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
// Create the kernel cl_kernel bones_kernel_<algorithm_name>_0 = clCreateKernel(bones_program, "bones_kernel_<algorithm_name>_0", &bones_errors); error_check(bones_errors); // Set all the arguments to the kernel function int bones_num_args = 0; <kernel_argument_list> // Start the kernel size_t bones_global_worksize[] = {<parallelism>}; bones_errors = clEnqueueNDRangeKernel(bones_queue,bones_kernel_<algorithm_name>_0,1,NULL,bones_global_worksize,NULL,0,NULL,&bones_event); error_check(bones_errors); // Synchronize and clean-up the kernel clFinish(bones_queue); clReleaseKernel(bones_kernel_<algorithm_name>_0);
int crackMD5(char *hash, char *cs, int passlen) { clut_device dev; // device struct cl_event evt; // performance measurement event cl_kernel kernel; // execution kernel cl_int ret; // error code double td; int cs_len, sync_flag; long chunk, disp; unsigned char bin_hash[HASH_SIZE]; cs_len = strlen(cs); sync_flag = 0; strToBin(hash, bin_hash, 2*HASH_SIZE); disp = DISPOSITIONS(cs_len, passlen); chunk = DISP_PER_CORE(disp, AVAILABLE_THREADS); debug("HOST", "Numero di disposizione da calcolare per stream processing unit = %lu\n", chunk); clut_open_device(&dev, PATH_TO_KERNEL); clut_print_device_info(&dev); /* ----------------------------------------- Create execution kernel ----------------------------------------- */ kernel = clCreateKernel(dev.program, KERNEL_NAME, &ret); clut_check_err(ret, "Fallita la creazione del kernel"); /* ----------------------------------- Create memory buffers on the device ----------------------------------- */ cl_mem dchunk = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(long), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del chunk"); cl_mem dhash = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, HASH_SIZE * sizeof(unsigned char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione dell'hash"); cl_mem charset = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, cs_len * sizeof(char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del charset"); cl_mem charset_size = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset"); cl_mem dpasslen = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset"); //cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, AVAILABLE_CORES * sizeof(int), NULL, &ret); cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del flag di sync"); cl_mem dcracked = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE, NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro"); cl_mem computed_hash = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE * sizeof(unsigned char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro"); /* ----------------------------------- Write memory buffers on the device ------------------------------------ */ ret = clEnqueueWriteBuffer(dev.queue, dchunk, CL_TRUE, 0, sizeof(long), &chunk, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura del chunk sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, dhash, CL_TRUE, 0, HASH_SIZE * sizeof(unsigned char), (int *)bin_hash, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura dell'hash sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, charset, CL_TRUE, 0, cs_len * sizeof(char), cs, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura del charset sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, charset_size, CL_TRUE, 0, sizeof(int), &cs_len, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, dpasslen, CL_TRUE, 0, sizeof(int), &passlen, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); //ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), &sync_flag, 0, NULL, NULL); ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &sync_flag, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); /* --------------------------------- Set the arguments to our compute kernel --------------------------------- */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dchunk); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dhash); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &charset); ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charset_size); ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dpasslen); ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &sync); ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &dcracked); ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &computed_hash); clut_check_err(ret, "Fallito il setting degli argomenti del kernel"); /* ---------------------------------------- Execute the OpenCL kernel ---------------------------------------- */ size_t global_dim[] = { AVAILABLE_THREADS }; ret = clEnqueueNDRangeKernel(dev.queue, kernel, 1, NULL, global_dim, NULL, 0, NULL, &evt); if(ret) clut_check_err(ret, "Fallita l'esecuzione del kernel"); /* -------------------------- Read the device memory buffer to the local variable ---------------------------- */ //int found[80]; int found; int digest[HASH_SIZE/sizeof(int)]; char *password = (char *) malloc(passlen * sizeof(char) + 1); memset(password, 0, passlen * sizeof(char) + 1); //memset(found, 0, AVAILABLE_CORES * sizeof(int)); //ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), found, 0, NULL, NULL); ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &found, 0, NULL, NULL); if(ret) clut_check_err(ret, "Fallimento nel leggere se la password e' stata trovata con successo"); debug("HOST", "La password e' stata trovata dal kernel OpenCL? "); /*int i; for(i=0; i<AVAILABLE_CORES; i++){ printf(" %d ", found[i]); } printf("\n");*/ if(found){ ret = clEnqueueReadBuffer(dev.queue, dcracked, CL_TRUE, 0, HASH_SIZE, digest, 0, NULL, NULL); if(ret) clut_check_err(ret, "Fallimento nel leggere la password"); printf("Si. Password: %s\n", (char *)digest); } else printf("No.\n"); /* ------------------------------------- Return kernel execution time ---------------------------------------- */ td = clut_get_duration(evt); debug("HOST","Kernel duration: %f secs\n", td); /* ----------------------------------------------- Clean up -------------------------------------------------- */ ret = clReleaseKernel(kernel); ret |= clReleaseMemObject(dchunk); ret |= clReleaseMemObject(dhash); ret |= clReleaseMemObject(charset); ret |= clReleaseMemObject(charset_size); ret |= clReleaseMemObject(dpasslen); ret |= clReleaseMemObject(sync); ret |= clReleaseMemObject(dcracked); ret |= clReleaseMemObject(computed_hash); clut_check_err(ret, "Rilascio di risorse fallito"); clFinish(dev.queue); clut_close_device(&dev); return 0; }
int main(int argc, char** argv) { cl_platform_id pf[MAX_PLATFORMS]; cl_uint nb_platforms = 0; cl_int err; // error code returned from api calls cl_device_type device_type = CL_DEVICE_TYPE_ALL; // Filter args // argv++; while (argc > 1) { if(!strcmp(*argv, "-g") || !strcmp(*argv, "--gpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_GPU; } else if(!strcmp(*argv, "-c") || !strcmp(*argv, "--cpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_CPU; } else if(!strcmp(*argv, "-s") || !strcmp(*argv, "--size")) { unsigned i; int r; char c; r = sscanf(argv[1], "%u%[mMkK]", &SIZE, &c); if (r == 2) { if (c == 'k' || c == 'K') SIZE *= 1024; else if (c == 'm' || c == 'M') SIZE *= 1024 * 1024; } argc--; argv++; } else break; argc--; argv++; } if(argc > 1) TILE = atoi(*argv); // Get list of OpenCL platforms detected // err = clGetPlatformIDs(3, pf, &nb_platforms); check(err, "Failed to get platform IDs"); printf("%d OpenCL platforms detected\n", nb_platforms); // For each platform do // for (cl_int p = 0; p < nb_platforms; p++) { cl_uint num; int platform_valid = 1; char name[1024], vendor[1024]; cl_device_id devices[MAX_DEVICES]; cl_uint nb_devices = 0; cl_context context; // compute context cl_program program; // compute program cl_kernel kernel; err = clGetPlatformInfo(pf[p], CL_PLATFORM_NAME, 1024, name, NULL); check(err, "Failed to get Platform Info"); err = clGetPlatformInfo(pf[p], CL_PLATFORM_VENDOR, 1024, vendor, NULL); check(err, "Failed to get Platform Info"); printf("Platform %d: %s - %s\n", p, name, vendor); // Get list of devices // err = clGetDeviceIDs(pf[p], device_type, MAX_DEVICES, devices, &nb_devices); printf("nb devices = %d\n", nb_devices); if(nb_devices == 0) continue; // Create compute context with "device_type" devices // context = clCreateContext (0, nb_devices, devices, NULL, NULL, &err); check(err, "Failed to create compute context"); // Load program source into memory // const char *opencl_prog; opencl_prog = file_load(KERNEL_FILE); // Attach program source to context // program = clCreateProgramWithSource(context, 1, &opencl_prog, NULL, &err); check(err, "Failed to create program"); // Compile program // { char flags[1024]; sprintf (flags, "-cl-mad-enable -cl-fast-relaxed-math -DSIZE=%d -DTILE=%d -DTYPE=%s", SIZE, TILE, "float"); err = clBuildProgram (program, 0, NULL, flags, NULL, NULL); if(err != CL_SUCCESS) { size_t len; // Display compiler log // clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &len); { char buffer[len+1]; fprintf(stderr, "--- Compiler log ---\n"); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "%s\n", buffer); fprintf(stderr, "--------------------\n"); } if(err != CL_SUCCESS) error("Failed to build program!\n"); } } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, KERNEL_NAME, &err); check(err, "Failed to create compute kernel"); // Allocate and initialize input data // alloc_buffers_and_user_data(context); // Iterate over devices // for(cl_int dev = 0; dev < nb_devices; dev++) { cl_command_queue queue; char name[1024]; cl_device_type dtype; err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, 1024, name, NULL); check(err, "Cannot get type of device"); err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtype, NULL); check(err, "Cannot get type of device"); printf("\tDevice %d : %s [%s]\n", dev, (dtype == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU", name); // Create a command queue // queue = clCreateCommandQueue(context, devices[dev], CL_QUEUE_PROFILING_ENABLE, &err); check(err,"Failed to create command queue"); // Write our data set into device buffer // send_input(queue); // Execute kernel // { cl_event prof_event; cl_ulong start, end; struct timeval t1,t2; double timeInMicroseconds; size_t global[2] = { SIZE, SIZE }; // global domain size for our calculation size_t local[2] = { TILE, TILE }; // local domain size for our calculation printf("\t%dx%d Threads in workgroups of %dx%d\n", global[0], global[1], local[0], local[1]); // Set kernel arguments // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer); check(err, "Failed to set kernel arguments"); gettimeofday (&t1, NULL); for (unsigned iter = 0; iter < ITERATIONS; iter++) { err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, &prof_event); check(err, "Failed to execute kernel"); } // Wait for the command commands to get serviced before reading back results // clFinish(queue); gettimeofday (&t2,NULL); // Check performance // timeInMicroseconds = (double)TIME_DIFF(t1, t2) / ITERATIONS; printf("\tComputation performed in %lf µs over device #%d\n", timeInMicroseconds, dev); clReleaseEvent(prof_event); } // Read back the results from the device to verify the output // retrieve_output(queue); // Validate computation // check_output_data(); clReleaseCommandQueue(queue); } // Cleanup // free_buffers_and_user_data(); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseContext(context); } return 0; }
int main(int argc, const char * argv[]) { //First we set the variables for measuring performance. struct timeval tim1, tim2; uint64_t time; //Calling the function "gettimeofday" to measure the time before the program executes. gettimeofday(&tim1, NULL); /* * These are the declarations of the OpenCL structures are described below: * cl_platform-id - Stores the types of platforms installed on the host. * cl_device_id - Stores the type of the device (GPU, CPU, etc.) * cl_context - Stores the context in which a command queue can be created. * cl_command_queue - Stores the command queue which governs how the GPU will * will execute the kernel. * cl_program - Stores the kernel code (which can be comprised of several kernels). Is compiled later its * functions get packaged into kernels. * cl_kernel - The OpenCL data structure that represents kernels. */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; //A cl_int used to store error flags that are returned if OpenCL function does not execute properly. cl_int err; /* * A file object and buffers used to store the input kernel code as well as allocate the memory for the kernel code * and the output log from the compiler during the compilation of the kernel code. */ FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; //The number of work items in each dimension of the data. size_t work_units_per_kernel; //This value determines the size of the nxn (square) array. int n = 1000; //Allocating the memory for the nxn arrays of floats. float **h_xx = (float**)malloc(sizeof(float*)*n); float **h_yy = (float**)malloc(sizeof(float*)*n); float **h_zz = (float**)malloc(sizeof(float*)*n); for(int i = 0; i<n; i++){ h_xx[i] = (float*)malloc(sizeof(float)*n); h_yy[i] = (float*)malloc(sizeof(float)*n); h_zz[i] = (float*)malloc(sizeof(float)*n); //Initializing the arrays. for(int j = 0; j<n; j++){ h_xx[i][j] = i+j; h_yy[i][j] = i+j; } } /* * These three variables of the type cl_mem (memory object) are used as buffers and hold the data which will * be sent to the device and then once calculated sent back to the host. */ cl_mem d_xx; cl_mem d_yy; cl_mem d_zz; // Obtains the Platform information installed on the host and stores into the memory location of the variable "platform" err = clGetPlatformIDs(1, &platform, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Failed to locate Platform." << std::endl; exit(1); } // Obtains the device information (looking for specifically GPU devices) and stores it into the memory location of the variable "device" err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err != CL_SUCCESS){ printf("Error: Failed to locate Device."); exit(1); } // Creates a context on the device and stores it into the "context" variable. context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create context." << std::endl; exit(1); } /* * The following code stores the file "arraySum.cl" into the FILE object "program_handle". It then determines the size * of the file and reads the content into the variable "program_buffer". */ program_handle = fopen("flopstestloop.cl", "r"); if(!program_handle){ std::cout << "Error: Failed to Load Kernel" << std::endl; exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)malloc(program_size + 1); program_buffer[program_size] = '\0'; fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); // Stores the kernel code into a program and stores it into the "program" variable. program = clCreateProgramWithSource(context, 1, (const char **)&program_buffer, (const size_t *)&program_size, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the program" << std::endl; exit(1); } free(program_buffer); //Compiles the program and stores the compiled code into the argument "program" err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not compile the program" << std::endl; /* * The following code first allocates the correct amount of memory in order to store the output of the compilers * build log and then it stores this log into the buffer "program_log". Finally it prints this buffer to the * screen. */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*)malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } //From the compiled code in the program creates a kernel called "arraysum" kernel = clCreateKernel(program, "arraysum", &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the kernel" << std::endl; exit(1); } //Creates a command queue and stores it into the variable "queue". queue = clCreateCommandQueue(context, device, 0, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the queue" << std::endl; exit(1); } //Creating the Device memory buffers. These will be used to transfer data from the host to the device and vice versa. d_xx = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_xx" << std::endl; exit(1); } d_yy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_yy" << std::endl; exit(1); } d_zz = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_zz" << std::endl; exit(1); } /* * This for loop loops over the each row in the matrices x and y first writes the row to the device memory where * the kernel arguments are then set and then then passed to the compiled kernel code already located on the device. * Once executed, the results are then stored in the d_zz buffer and are read back to the host. */ for(int i = 0; i<n; i++) { //Writing the data from the host to the device err = clEnqueueWriteBuffer(queue, d_xx, CL_TRUE, 0, sizeof(float)*n, h_xx[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not write to buffer d_xx" << std::endl; exit(1); } err = clEnqueueWriteBuffer(queue, d_yy, CL_TRUE, 0, sizeof(float)*n, h_yy[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not write to buffer d_yy" << std::endl; exit(1); } //Setting the Kernel Arguments err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_xx); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_xx." << std::endl; exit(1); } err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_yy); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_yy." << std::endl; exit(1); } err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_zz); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_zz." << std::endl; } work_units_per_kernel = n; //Executing the Kernel err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not execute kernel." << std::endl; exit(1); } //Reading the Data from the Kernel err = clEnqueueReadBuffer(queue, d_zz, CL_TRUE, 0, n*(sizeof(float)), h_zz[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not read data from kernel." << std::endl; exit(1); } } //Measuring the time after the OpenCL code has executed and has been copied back to the host. gettimeofday(&tim2, NULL); //Finding the difference between the two measured times. time = tim2.tv_sec - tim1.tv_sec; //Displaying the elapsed time in seconds. std::cout << time + (tim2.tv_usec - tim1.tv_usec)/1000000.00 << std::endl; //The previously allocated memory is freed. clReleaseMemObject(d_xx); clReleaseMemObject(d_yy); clReleaseMemObject(d_zz); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main() { // Initiating opencl cl_device_id device_id; cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in device."<<std::endl; return EXIT_FAILURE; } cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { std::cout<<"Error in context."<<std::endl; return EXIT_FAILURE; } cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { std::cout<<"Error in command queue."<<std::endl; return EXIT_FAILURE; } std::ifstream in("transpMatrix.cl"); std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>()); const char* kernelSource = contents.c_str(); cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err); if (!program) { std::cout<<"Error in program."<<std::endl; return EXIT_FAILURE; } err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; std::cout<<"Error in compiling the opencl program."<<std::endl; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); std::cout<<buffer<<std::endl; return EXIT_FAILURE; } cl_kernel kernel = clCreateKernel(program, "simplecl", &err); if (!kernel || err != CL_SUCCESS) { std::cout<<"Error in kernel "<<err<<std::endl; return EXIT_FAILURE; } // Data to compute float* data = new float[count*count]; for(int i = 0; i < count; ++i) { for(int j = 0; j < count; ++j) { data[i*count+j] = rand()%10; std::cout<<data[i*count+j]<<" "; } std::cout<<std::endl; } std::cout<<std::endl; // Creating communication buffers cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count*count, NULL, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL); if (!input || !output) { std::cout<<"Error in allocation."<<std::endl; return EXIT_FAILURE; } // Copy data to input buffer err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in copy."<<std::endl; return EXIT_FAILURE; } err = 0; err = clSetKernelArg(kernel, 0, sizeof(int), &count); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { std::cout<<"Error in argument."<<std::endl; return EXIT_FAILURE; } size_t local[] = {1,1}; size_t global[] = {10,10}; // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); // if (err != CL_SUCCESS) // { // std::cout<<"Error in getting loal."<<std::endl; // return EXIT_FAILURE; // } err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in pushing to queue "<<err<<std::endl; return EXIT_FAILURE; } clFinish(commands); // Is done now err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL ); if (err != CL_SUCCESS) { std::cout<<"Error in reading back."<<std::endl; return EXIT_FAILURE; } for(int i = 0; i < count; ++i) { for(int j = 0; j < count; ++j) { std::cout<<data[i*count+j]<<" "; } std::cout<<std::endl; } std::cout<<std::endl; return 0; }
void vectorVectorAdditionGMDP (cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,double * h_VectA,double *h_VectB, double *h_Output,int vectSize) { cl_event gpuExec[1]; cl_int err; cl_command_queue cmdQueue; //holds command queue object cl_kernel kernel; //holds kernel object cl_mem d_VectA,d_VectB,d_Output; //holds device input output buffer cl_event events; // events size_t globalWorkSize[2]={vectSize,vectSize}; //holds global group size double gflops=0.0; //holds total achieved gflops cl_ulong startTime, endTime,elapsedTime; //holds time float executionTimeInSeconds; //holds total execution time /*create command queue*/ cmdQueue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err); if( err != CL_SUCCESS || cmdQueue == 0) { printf("\n\t Failed to create command queue \n" ); exit (-1); } /*create kernel object*/ kernel = clCreateKernel(program,"VectVectAddDPKernel",&err); OPENCL_CHECK_STATUS("error while creating kernel",err); /*create buffer*/ d_VectA=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectA,&err); OPENCL_CHECK_STATUS("error while creating buffer for input",err); d_VectB=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectB,&err); OPENCL_CHECK_STATUS("error while creating buffer for input",err); d_Output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(double)*vectSize,NULL,&err); OPENCL_CHECK_STATUS("error while creating buffer for d_Output",err); /*set kernel arg*/ err=clSetKernelArg(kernel,0,sizeof(cl_mem),&d_VectA); OPENCL_CHECK_STATUS("error while setting arg 0",err); err=clSetKernelArg(kernel,1,sizeof(cl_mem),&d_VectB); OPENCL_CHECK_STATUS("error while setting arg 1",err); err=clSetKernelArg(kernel,2,sizeof(cl_mem),&d_Output); OPENCL_CHECK_STATUS("error while setting arg 2",err); /*load kernel*/ err = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]); OPENCL_CHECK_STATUS("error while creating ND range",err); //completion of all commands to command queue err = clFinish(cmdQueue); OPENCL_CHECK_STATUS("clFinish",err); /* calculate start time and end time*/ clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); /* total elapsed time*/ elapsedTime = endTime-startTime; /*total execution time*/ executionTimeInSeconds = (float)(1.0e-9 * elapsedTime); /* reading buffer object*/ err = clEnqueueReadBuffer(cmdQueue,d_Output,CL_TRUE,0,sizeof(cl_double)*vectSize,h_Output,0,0,&events); OPENCL_CHECK_STATUS("error while reading buffer",err); /* calculate total gflops*/ gflops= (1.0e-9 * (( vectSize) / executionTimeInSeconds)); // Print the gflops on the screen print_on_screen("Vector Vector Addition double precision using global memory",executionTimeInSeconds,vectSize,gflops,1); //check results vectVectAddCheckResultGMDP(h_VectA,h_VectB,h_Output,vectSize); //release opencl objects clReleaseMemObject(d_VectA); clReleaseMemObject(d_VectB); clReleaseMemObject(d_Output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(cmdQueue); clReleaseContext(context); }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
void OpenCLExecuter::ocl_filter(int src_chan) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process //temp array unsigned char* data = new unsigned char[iNumElements]; // set Local work size dimensions //szLocalWorkSize = 256; // set Global work size dimensions //szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize; //szGlobalWorkSize = iNumElements; // printf("OPENCL: number of elements: %d\n", (int)iNumElements); // printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize); // printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize); // printf("OPENCL: work groups: %d\n", (int)((float)szGlobalWorkSize/(float)szLocalWorkSize)); size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end //Prepare data to upload for(int j=0; j<iNumElements; j++) data[j] = volobj->texture3d[3*j+src_chan]; // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Write data from host to GPU // err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); // printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host // err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); // printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); //read data back for(int i=0; i<iNumElements; i++) { if(volobj->is_greyscale) volobj->texture3d[3*i+0] = volobj->texture3d[3*i+1] = volobj->texture3d[3*i+2] = data[i]; else volobj->texture3d[3*i+src_chan] = data[i]; } // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); delete[] data; }
int Parallel::setup() { /** * OpenCL initialization. */ cl_int status = Simulator::setup(); CheckStatus(status, "Simulator::setup() failed."); cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); CheckStatus(status, "clGetPlatformIDs, fetching number"); DEBUG_STDOUT("Number of platforms: " << numPlatforms); cl_platform_id platform = NULL; if (numPlatforms > 0) { std::unique_ptr<cl_platform_id[]> platforms (new cl_platform_id[numPlatforms]); status = clGetPlatformIDs(numPlatforms, platforms.get(), NULL); CheckStatus(status, "clGetPlatformIDs, fetching platforms"); for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); CheckStatus(status, "clGetPlatformInfo"); } // Just grab the first platform. platform = platforms[0]; } CheckConditional(platform != NULL, "platform == NULL"); cl_uint numDevices; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); CheckStatus(status, "clGetDeviceIDs: fetching number"); DEBUG_STDOUT("Number of devices: " << numDevices); cl_device_id *devices = new cl_device_id[numDevices]; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); CheckStatus(status, "clGetDeviceIDs: fetching devices"); int deviceIndex = 0; for (unsigned i = 0; i < numDevices; ++i) { char pbuf[100]; status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (!strncmp(pbuf, "ATI", 3)) { deviceIndex = i; } } /* Create the context. */ context = clCreateContext(0, numDevices, devices, NULL, NULL, &status); CheckConditional(context != NULL, "clCreateContextFromType"); /* Create command queue */ cl_command_queue_properties prop = CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceIndex], prop, &status); CheckStatus(status, "clCreateCommandQueue"); /* Create a CL program using the kernel source */ SDKFile kernelFile; std::string kernelPath = getenv("HOME") + std::string("/md-simulator/src/TestKernel.cl"); if(!kernelFile.open(kernelPath.c_str())) { DEBUG_STDERR("Failed to load kernel file : " << kernelPath); return MD_FAILURE; } const char *source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); CheckStatus(status, "clCreateProgramWithSource"); /* Create a cl program executable for all the devices specified */ status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); if (status != CL_SUCCESS) { if (status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; std::unique_ptr<char[]> buildLog (nullptr); //char *buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(program, devices[deviceIndex], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog.get(), &buildLogSize); CheckStatus(logStatus, "clGetProgramBuildInfo"); buildLog = std::unique_ptr<char[]>(new char[buildLogSize]); if(!buildLog) { return MD_FAILURE; } std::fill_n(buildLog.get(), buildLogSize, 0); logStatus = clGetProgramBuildInfo(program, devices[deviceIndex], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog.get(), NULL); CheckStatus(logStatus, "clGetProgramBuildInfo (2)"); DEBUG_STDERR("\n\t\t\tBUILD LOG\n"); DEBUG_STDERR("************************************************\n"); DEBUG_STDERR(buildLog.get()); DEBUG_STDERR("************************************************\n"); } } CheckStatus(status, "clBuildProgram"); /* Get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel(program, "computeAccelerations", &status); CheckStatus(status, "clCreateKernel"); /* Check group size against group size returned by kernel */ status = clGetKernelWorkGroupInfo(kernel, devices[deviceIndex], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); CheckStatus(status, "clGetKernelWorkGroupInfo"); DEBUG_STDOUT("kernelWorkGroupSize: " << kernelWorkGroupSize); /** * Initialize some simulator data structures. */ global = particleCount * particleCount; local = particleCount; if (global * local > kernelWorkGroupSize) { DEBUG_STDERR("WARNING - global * local > kernelWorkGroupSize; global: " << global << ", local: " << local << ", kernelWorkGroupSize: " << kernelWorkGroupSize); return MD_FAILURE; } // Data holds the molecule positions. data = std::unique_ptr<float[]> (new float[particleCount * 3]); // Constants holds simulator constants. constants = std::unique_ptr<float[]> (new float[NUM_CONSTANTS]); // Copy constants to buffer; constants[0] = epsilon; constants[1] = sigma; constants[2] = negForceCutoffMinusHalf; constants[3] = forceCutoffMinusHalf; constants[4] = wallStiffness; // Results holds pairwise forces. results = std::unique_ptr<float[]> (new float[particleCount * particleCount * 3]); return MD_SUCCESS; }
void OpenCLExecuter::ocl_filterPeronaMalik(float lambda, float dT, unsigned char* src_array, unsigned char* dst_array, int w, int h, int d) { float lambda2 = lambda*lambda; cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = w*h*d; // Length of float arrays to process size_t global_threads[3] ={w,h,d}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(float), (void*)&lambda2); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(float), (void*)&dT); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); }
int main(void) { //time meassuring struct timeval tvs; struct timeval tve; float elapsedTime; int Nx; int Ny; int Nz; int N; int plotnum=0; int Tmax=0; int plottime=0; int plotgap=0; float Lx,Ly,Lz; float dt=0.0; float A=0.0; float B=0.0; float Du=0.0; float Dv=0.0; float a[2]={1.0,0.0}; float b[2]={0.5,0.0}; float* x,*y,*z ; float* u[2],*v[2]; //openCL variables cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem cl_u[2] = {NULL,NULL}; cl_mem cl_v[2] = {NULL,NULL}; cl_mem cl_uhat[2] = {NULL,NULL}; cl_mem cl_vhat[2] = {NULL,NULL}; cl_mem cl_x = NULL; cl_mem cl_y = NULL; cl_mem cl_z = NULL; cl_mem cl_kx = NULL; cl_mem cl_ky = NULL; cl_mem cl_kz = NULL; cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL; cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, 0, &ret); size_t source_size; char *source_str; //end opencl int i,n; int status=0; //int start, finish, count_rate, ind, numthreads char nameconfig[100]=""; //Read infutfile char InputFileName[]="./INPUTFILE"; FILE*fp; fp=fopen(InputFileName,"r"); if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);} int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B); if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);} fclose(fp); printf("NX %d\n",Nx); printf("NY %d\n",Ny); printf("NZ %d\n",Nz); printf("Tmax %d\n",Tmax); printf("plotgap %d\n",plotgap); printf("Lx %f\n",Lx); printf("Ly %f\n",Ly); printf("Lz %f\n",Lz); printf("dt %f\n",dt); printf("Du %f\n",Du); printf("Dv %f\n",Dv); printf("F %f\n",A); printf("k %f\n",B); printf("Read inputfile\n"); N=Nx*Ny*Nz; plottime=plotgap; B=A+B; //ALLocate the memory u[0]=(float*) malloc(N*sizeof(float)); v[0]=(float*) malloc(N*sizeof(float)); x=(float*) malloc(Nx*sizeof(float)); y=(float*) malloc(Ny*sizeof(float)); z=(float*) malloc(Nz*sizeof(float)); //allocate gpu mem cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); printf("allocated space\n"); // FFT library realted declarations. clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {Nx, Ny, Nz}; // Setup clFFT. clfftSetupData fftSetup; ret = clfftInitSetupData(&fftSetup); ret = clfftSetup(&fftSetup); // Create a default plan for a complex FFT. ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths); // Set plan parameters. ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE); // Bake the plan. ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL); // Create temporary buffer. cl_mem tmpBufferu = 0; cl_mem tmpBufferv = 0; // Size of temp buffer. size_t tmpBufferSize = 0; status = clfftGetTmpBufSize(planHandle, &tmpBufferSize); if ((status == 0) && (tmpBufferSize > 0)) { tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); if (ret != CL_SUCCESS) printf("Error with tmpBuffer clCreateBuffer\n"); } //kernel grid fp = fopen("./grid.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL); grid = clCreateKernel(p_grid, "grid", &ret); //first x cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx); size_t global_work_size_x[3] = {Nx, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny); size_t global_work_size_y[3] = {Ny, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz); size_t global_work_size_z[3] = {Nz, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid); //kernel initial data fp = fopen("./initialdata.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL); initialdata = clCreateKernel(p_initialdata, "initialdata", &ret); ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x); ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y); ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z); ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz); size_t global_work_size[3] = {N, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata); ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseMemObject(cl_x); ret = clReleaseMemObject(cl_y); ret = clReleaseMemObject(cl_z); //write to disk fp=fopen("./data/xcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); } for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);} fclose( fp ); fp=fopen("./data/ycoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); } for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);} fclose( fp ); fp=fopen("./data/zcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); } for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);} fclose( fp ); free(x); free(y); free(z); n=0; plotnum=0; //output of initial data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); //frequencies kernel fp = fopen("./frequencies.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL); frequencies = clCreateKernel(p_frequencies, "frequencies", &ret); //get frequencies first x cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); printf("Setup grid, fourier frequencies and initialcondition\n"); //load the rest of the kernels //linearpart kernel fp = fopen("./linearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, NULL); linearpart = clCreateKernel(p_linearpart, "linearpart", &ret); //kernel nonlinear fp = fopen("./nonlinearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL); nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret); printf("Got initial data, starting timestepping\n"); gettimeofday(&tvs, NULL); for(n=0;n<=Tmax;n++){ //linear ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); //nonlinearpart ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt); ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]); ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // linear part ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d\n",n*dt,n,plotnum); plottime=plottime+plotgap; plotnum=plotnum+1; ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); //output of data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); } } gettimeofday(&tve, NULL); printf("Finished time stepping\n"); elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0; // sec to ms elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0; // us to ms printf("%f,",elapsedTime); clReleaseMemObject(cl_u[0]); clReleaseMemObject(cl_u[1]); clReleaseMemObject(cl_v[0]); clReleaseMemObject(cl_v[1]); clReleaseMemObject(cl_uhat[0]); clReleaseMemObject(cl_uhat[1]); clReleaseMemObject(cl_vhat[0]); clReleaseMemObject(cl_vhat[1]); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); clReleaseMemObject(cl_kz); ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart); ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart); free(u[0]); free(v[0]); clReleaseMemObject(tmpBufferu); clReleaseMemObject(tmpBufferv); /* Release the plan. */ ret = clfftDestroyPlan(&planHandle); /* Release clFFT library. */ clfftTeardown(); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("Program execution complete\n"); return 0; }
void OpenCLExecuter::ocl_filterGaussian(unsigned char* src_array, unsigned char* dst_array, int w, int h, int d) { // printf("gaussian_sum: %f\n", gaussian_sum); printf("gaussian_width: %d\n", filter_width); printf("gaussian_mask size: %d\n", filter_kernel.size()); cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem gauss_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = w*h*d; // Length of float arrays to process size_t global_threads[3] ={w,h,d}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); gauss_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(float) * filter_kernel.size(), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); //================================================== // X axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianX", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /*size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);*/ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host //err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //================================================== // Y axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianY", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /* size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); */ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU //err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host //err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //================================================== // Z axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianZ", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /*size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); */ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end //Prepare data to upload //for(int j=0; j<iNumElements; j++) // data[j] = volobj->texture3d[3*j+0]; // Write data from host to GPU //err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); if(gauss_buffer)clReleaseMemObject(gauss_buffer); }
void DarkenManager::setupCLprog() { cl_int errNum; std::string fileName("darken.cl"); fileName = CL_LOC + fileName; std::ifstream file(fileName.c_str() ); if (!file) { std::stringstream s; s << "error opening " << fileName << " in " << __FILE__ << " at " << __LINE__ << std::endl; throw std::runtime_error(s.str()); } std::string source = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>()); const char *souce_sting=source.c_str(); m_darken_program=clCreateProgramWithSource(m_context, 1, &souce_sting, NULL, &errNum); ASSERT_CL(errNum); errNum=clBuildProgram(m_darken_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (errNum==CL_BUILD_PROGRAM_FAILURE) { size_t logSize; errNum = clGetProgramBuildInfo(m_darken_program, m_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); ASSERT_CL(errNum); std::vector<char> log_vec; log_vec.resize(logSize); errNum = clGetProgramBuildInfo(m_darken_program, m_id, CL_PROGRAM_BUILD_LOG, logSize, &log_vec[0], &logSize); ASSERT_CL(errNum); std::string log(&log_vec[0]); std::string errMessage("Error compiling: "); errMessage += fileName; errMessage += "\n"; errMessage += log; std::cout << errMessage << std::endl; throw std::runtime_error(errMessage); } ASSERT_CL(errNum); // m_darken_kernel = clCreateKernel(m_darken_program, "darken", &errNum); m_darken_kernel = clCreateKernel(m_darken_program, "average_NO_LOCAL", &errNum); // m_darken_kernel = clCreateKernel(m_darken_program, "average", &errNum); ASSERT_CL(errNum); errNum = clSetKernelArg(m_darken_kernel, 0, sizeof(m_cl_src_buffer), &m_cl_src_buffer); errNum |= clSetKernelArg(m_darken_kernel, 1, sizeof(m_cl_dst_buffer), &m_cl_dst_buffer); cl_int w = WINDOW_SIZE_WIDTH; cl_int h = WINDOW_SIZE_HEIGHT; errNum |= clSetKernelArg(m_darken_kernel, 2, sizeof(cl_int), &w); errNum |= clSetKernelArg(m_darken_kernel, 3, sizeof(cl_int), &h); errNum |= clSetKernelArg(m_darken_kernel, 4, (WORKGROUP_DIM_X+2)*(WORKGROUP_DIM_Y+2)*4, NULL); ASSERT_CL(errNum); }
int main() { // Create the variables for the time measure int starttime, stoptime; //Get initial time starttime = GetTimeMs(); // This code executes on the OpenCL host // Host data float *A=NULL; // Input array float *B=NULL; // Input array float *C=NULL; // Output array // Elements in each array const int elements=2048; // Compute the size of the data size_t datasize=sizeof(int)*elements; // Allocate space for input/output data A=(float*)malloc(datasize); B=(float*)malloc(datasize); C=(float*)malloc(datasize); // Initialize the input data A[0]=2.2; A[1]=1.3; B[0]=3.7; B[1]=5.4; // Load the kernel source code into the array programSource FILE *fp; char *programSource; size_t programSize; fp = fopen("fplos_kernels.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } programSource = (char*)malloc(MAX_SOURCE_SIZE); fclose( fp ); // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms=0; status=clGetPlatformIDs(0, NULL,&numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms=NULL; platforms=(cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices=0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL,&numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A cl_mem bufA; bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will contain the data // from the host array B cl_mem bufB; bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output data cl_mem bufC; bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); // Write input array A to the device buffer bufferA status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); // Write input array B to the device buffer bufferB status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); // Create a program with source code cl_program program=clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status); // Build (compile) the program for the device status=clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); // Create the vector addition kernel cl_kernel kernel; kernel=clCreateKernel(program, "floatadd", &status); // Associate the input and output buffers with the kernel status=clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA); status=clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB); status=clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC); // Define an index space (global work size) of work // items for execution. A workgroup size (local work size) // is not required, but can be used. size_t globalWorkSize[1]; // There are 'elements' work-items globalWorkSize[0]=elements; // Execute the kernel for execution status=clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); printf("Output = %.1f\n", C[0]); printf("Output = %.1f\n", C[1]); // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); clReleaseContext(context); // Free host resources free(A); free(B); free(C); free(platforms); free(devices); //Get initial time stoptime = GetTimeMs(); printf("Duration= %d ms\n", stoptime - starttime); return 0; }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Output information err = output_device_info(device_id); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); if (!kernel_pi || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err)); return EXIT_FAILURE; } // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. %d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nwork_groups * work_group_size; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
void spmv_csr_cpu(const csr_matrix* csr,const float* x,const float* y,float* out) { int num_rows = csr->num_rows; int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { fprintf(stderr, "ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * kernel_csr_src = "csr_ocl"; char * tempchar = "./spmv_kernel.cl"; FILE * fp = fopen(tempchar, "rb"); if(!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); int use_gpu = 1; if(initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = { source, 0 }; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel kernel_csr; kernel_csr = clCreateKernel(prog, kernel_csr_src, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateKernel() 0 => %d\n", err); return -1; } clReleaseProgram(prog); cl_mem memAp; cl_mem memAj; cl_mem memAx; cl_mem memx; cl_mem memy; memAp = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*(csr.num_rows+1), NULL, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memAj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*csr.num_nonzeros, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memAx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*csr.num_rows, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} //write buffers err = clEnqueueWriteBuffer(cmd_queue, memAp, CL_FALSE, 0, sizeof(unsigned int)*csr.num_rows+4, csr->Ap, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memAj, CL_FALSE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr->Aj, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memAx, CL_FALSE, 0, sizeof(float)*csr.num_nonzeros, csr->Ax, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memx, CL_FALSE, 0, sizeof(float)*csr.num_cols, x, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memy, CL_FALSE, 0, sizeof(float)*csr.num_rows, y, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } clSetKernelArg(kernel_csr, 0, sizeof(unsigned int *), (unsigned int *) &csr->num_rows); clSetKernelArg(kernel_csr, 1, sizeof(void *), (void*) &memAp); clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memAj); clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memAx); clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memx); clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memy); err = clEnqueueNDRangeKernel(cmd_queue, kernel_csr, 2, NULL, global_work, local_work, 0, 0, 0); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, memy, 1, 0, sizeof(float)*csr.num_rows, out, 0, 0, 0); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1 clEnqueueReadBuffer: out\n"); return -1; } clReleaseMemObject(memAp); clReleaseMemObject(memAj); clReleaseMemObject(memAx); clReleaseMemObject(memx); clReleaseMemObject(memy); }
void buildOpenCLKernels_update_halo_kernel1_fr2(int xdim0, int ydim0, int xdim1, int ydim1, int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, int ydim5, int xdim6, int ydim6) { // int ocl_fma = OCL_FMA; if (!isbuilt_update_halo_kernel1_fr2) { buildOpenCLKernels(); // clSafeCall( clUnloadCompiler() ); cl_int ret; char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr2.cl"}; // Load the kernel source code into the array source_str FILE *fid; char *source_str[1]; size_t source_size[1]; for (int i = 0; i < 1; i++) { fid = fopen(source_filename[i], "r"); if (!fid) { fprintf(stderr, "Can't open the kernel source file!\n"); exit(1); } source_str[i] = (char *)malloc(4 * 0x1000000); source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); if (source_size[i] != 4 * 0x1000000) { if (ferror(fid)) { printf("Error while reading kernel source file %s\n", source_filename[i]); exit(-1); } if (feof(fid)) printf("Kernel source file %s succesfuly read.\n", source_filename[i]); // printf("%s\n",source_str[i]); } fclose(fid); } printf("Compiling update_halo_kernel1_fr2 %d source -- start \n", OCL_FMA); // Create a program from the source OPS_opencl_core.program = clCreateProgramWithSource( OPS_opencl_core.context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); clSafeCall(ret); // Build the program char buildOpts[255 * 8]; char *pPath = NULL; pPath = getenv("OPS_INSTALL_PATH"); if (pPath != NULL) if (OCL_FMA) sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/c/include " "-DOPS_WARPSIZE=%d " "-Dxdim0_update_halo_kernel1_fr2=%d " "-Dydim0_update_halo_kernel1_fr2=%d " "-Dxdim1_update_halo_kernel1_fr2=%d " "-Dydim1_update_halo_kernel1_fr2=%d " "-Dxdim2_update_halo_kernel1_fr2=%d " "-Dydim2_update_halo_kernel1_fr2=%d " "-Dxdim3_update_halo_kernel1_fr2=%d " "-Dydim3_update_halo_kernel1_fr2=%d " "-Dxdim4_update_halo_kernel1_fr2=%d " "-Dydim4_update_halo_kernel1_fr2=%d " "-Dxdim5_update_halo_kernel1_fr2=%d " "-Dydim5_update_halo_kernel1_fr2=%d " "-Dxdim6_update_halo_kernel1_fr2=%d " "-Dydim6_update_halo_kernel1_fr2=%d ", pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); else sprintf(buildOpts, "-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d " "-Dxdim0_update_halo_kernel1_fr2=%d " "-Dydim0_update_halo_kernel1_fr2=%d " "-Dxdim1_update_halo_kernel1_fr2=%d " "-Dydim1_update_halo_kernel1_fr2=%d " "-Dxdim2_update_halo_kernel1_fr2=%d " "-Dydim2_update_halo_kernel1_fr2=%d " "-Dxdim3_update_halo_kernel1_fr2=%d " "-Dydim3_update_halo_kernel1_fr2=%d " "-Dxdim4_update_halo_kernel1_fr2=%d " "-Dydim4_update_halo_kernel1_fr2=%d " "-Dxdim5_update_halo_kernel1_fr2=%d " "-Dydim5_update_halo_kernel1_fr2=%d " "-Dxdim6_update_halo_kernel1_fr2=%d " "-Dydim6_update_halo_kernel1_fr2=%d ", pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); else { sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); exit(EXIT_FAILURE); } ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); if (ret != CL_SUCCESS) { char *build_log; size_t log_size; clSafeCall(clGetProgramBuildInfo( OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); build_log = (char *)malloc(log_size + 1); clSafeCall(clGetProgramBuildInfo( OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); build_log[log_size] = '\0'; fprintf( stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); fprintf(stderr, "\n========================================================= \n"); free(build_log); exit(EXIT_FAILURE); } printf("compiling update_halo_kernel1_fr2 -- done\n"); // Create the OpenCL kernel OPS_opencl_core.kernel[22] = clCreateKernel( OPS_opencl_core.program, "ops_update_halo_kernel1_fr2", &ret); clSafeCall(ret); isbuilt_update_halo_kernel1_fr2 = true; } }
void kernel_gpu_opencl_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem, int order, long maxheight, int count, long *currKnode, long *offset, long *lastKnode, long *offset_2, int *start, int *end, int *recstart, int *reclength) { //======================================================================================================================================================150 // CPU VARIABLES //======================================================================================================================================================150 // timer long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long time6; time0 = get_time(); //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I./../"); #ifdef DEFAULT_ORDER_2 sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "findRangeK", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time1 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY MALLOC //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // knodesD //==================================================50 cl_mem knodesD; knodesD = clCreateBuffer( context, CL_MEM_READ_WRITE, knodes_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 cl_mem currKnodeD; currKnodeD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 cl_mem offsetD; offsetD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // lastKnodeD //==================================================50 cl_mem lastKnodeD; lastKnodeD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offset_2D //==================================================50 cl_mem offset_2D; offset_2D = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // startD //==================================================50 cl_mem startD; startD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // endD //==================================================50 cl_mem endD; endD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 cl_mem ansDStart; ansDStart = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 cl_mem ansDLength; ansDLength = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // knodesD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue knodesD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to knodes_mem, // size to be copied knodes, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue currKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied currKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offsetD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // lastKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue lastKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied lastKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offset_2D //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offset_2D, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset_2, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // startD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue startD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied start, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // endD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue endD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied end, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue endD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied end, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue ansDLength, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied reclength, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time3 = get_time(); //==================================================50 // END //==================================================50 //======================================================================================================================================================150 // KERNEL //======================================================================================================================================================150 //====================================================================================================100 // Execution Parameters //====================================================================================================100 size_t local_work_size[1]; local_work_size[0] = order < 1024 ? order : 1024; size_t global_work_size[1]; global_work_size[0] = count * local_work_size[0]; printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //====================================================================================================100 // Kernel Arguments //====================================================================================================100 clSetKernelArg( kernel, 0, sizeof(long), (void *) &maxheight); clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *) &knodesD); clSetKernelArg( kernel, 2, sizeof(long), (void *) &knodes_elem); clSetKernelArg( kernel, 3, sizeof(cl_mem), (void *) &currKnodeD); clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *) &offsetD); clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *) &lastKnodeD); clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *) &offset_2D); clSetKernelArg( kernel, 7, sizeof(cl_mem), (void *) &startD); clSetKernelArg( kernel, 8, sizeof(cl_mem), (void *) &endD); clSetKernelArg( kernel, 9, sizeof(cl_mem), (void *) &ansDStart); clSetKernelArg( kernel, 10, sizeof(cl_mem), (void *) &ansDLength); //====================================================================================================100 // Kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time4 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY (CONTD.) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansDStart, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(int), // Size to copy. recstart, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansDLength, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(int), // Size to copy. reclength, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time5 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY DEALLOCATION //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(knodesD); clReleaseMemObject(currKnodeD); clReleaseMemObject(offsetD); clReleaseMemObject(lastKnodeD); clReleaseMemObject(offset_2D); clReleaseMemObject(startD); clReleaseMemObject(endD); clReleaseMemObject(ansDStart); clReleaseMemObject(ansDLength); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time6 = get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of GPU_CUDA KERNEL:\n"); printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100); printf("Total time:\n"); printf("%.12f s\n", (float) (time6-time0) / 1000000); //======================================================================================================================================================150 // END //======================================================================================================================================================150 }
int main(int argc, char** argv) { cl_device_type DEV_TYPE = CL_DEVICE_TYPE_GPU; cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue command_queue; cl_program program; cl_kernel kernel; cl_mem buffer_src; cl_mem buffer_dst; cl_int err; size_t local = 4; size_t global = local * 8; size_t SIZE = global; err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]\n", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } cl_uint num_dev = 1; err = clGetDeviceIDs(platform, DEV_TYPE, num_dev, &device, &num_dev); //printf("clGetDeviceIDs : device = 0x%x\n", device); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]\n", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } if (num_dev < 1) exit(EXIT_FAILURE); int* host_src = (int*) calloc(SIZE, sizeof(int)); for (int i = 0; i < SIZE; i++) { host_src[i] = -666; //i * 10; } int* host_dst = (int*) calloc(SIZE, sizeof(int)); context = clCreateContext(0, num_dev, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } command_queue = clCreateCommandQueue(context, device, 0, &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } buffer_src = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE * sizeof(int), NULL, &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } buffer_dst = clCreateBuffer(context, CL_MEM_WRITE_ONLY, SIZE * sizeof(int), NULL, &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } size_t kernel_src_len = strlen(kernel_src); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, &kernel_src_len, &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } /*fprintf(stderr, "%s(%d) : size = %d\n", "main", __LINE__, SIZE * sizeof(int)); for(int i=0; i<16; i++){ fprintf(stderr, "%s(%d) : ptr[%d] = %d\n", "main", __LINE__, i, *((int *)host_src + i)); } for(int i=0; i<16; i++){ fprintf(stderr, "%s(%d) : ptr[%d] = %d\n", "main", __LINE__, i, *((int *)host_src + 16 + i)); }*/ err = clEnqueueWriteBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_src, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //err = clEnqueueWriteBuffer(command_queue, buffer_dst, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } kernel = clCreateKernel(program, "sample", &err); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &buffer_dst); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &buffer_src); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } int offset = 100; err = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &offset); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //global = global/2; err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //size_t gwo = global; //err = clEnqueueNDRangeKernel(command_queue, kernel, 1, &gwo, &global, &local, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //err = clFinish(command_queue); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //err = clEnqueueReadBuffer(command_queue, buffer_dst, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]); //memset(host_src, 0, SIZE*sizeof(int)); //err = clEnqueueWriteBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_src, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]); //err = clEnqueueReadBuffer(command_queue, buffer_src, CL_TRUE, 0, SIZE * sizeof(int), host_dst, 0, NULL, NULL); //if (err != CL_SUCCESS) { printf("[%s:%d] ERR[%d]", __FILE__, __LINE__, err); exit(EXIT_FAILURE); } //for (int i = 0; i < SIZE; i++) printf("[%2d] %d\n", i, host_dst[i]); free(host_src); free(host_dst); return 0; }