void MFNHashTypePlainOpenCL::copyDeviceFoundPasswordsToHost() { trace_printf("MFNHashTypePlainOpenCL::copyDeviceFoundPasswordsToHost()\n"); cl_int errorCode; errorCode = clEnqueueReadBuffer (this->OpenCL->getCommandQueue(), this->DeviceSuccessAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->activeHashesProcessed.size() /* bytes to copy */, (void *)this->HostSuccessAddress, NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueReadBuffer (this->OpenCL->getCommandQueue(), this->DeviceFoundPasswordsAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->passwordLength * this->activeHashesProcessed.size() /* bytes to copy */, (void *)this->HostFoundPasswordsAddress, NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } }
cl_bool there_was_an_error(cl_int err){ if (err != CL_SUCCESS){ printf("%s\n", print_cl_errstring(err)); return 1; } return 0; }
inline void check_error(cl_int err) { if (err != CL_SUCCESS) { std::cerr << "ERROR: " << " (" << err << ") " << print_cl_errstring(err) << std::endl; exit(EXIT_FAILURE); } }
int errorCodeOut(const cl_int error,const char* ownMessage) { if (error != CL_SUCCESS) { printf("%s\n%s\n", ownMessage, print_cl_errstring(error)); return -1; } return 0; }
void MFNHashTypeSaltedOpenCL_MD5_PS::launchKernel() { trace_printf("MFNHashTypeSaltedOpenCL_MD5_PS::launchKernel()\n"); cl_event kernelLaunchEvent; cl_int errorCode; size_t numberWorkgroups; size_t numberWorkitems; numberWorkgroups = this->GPUBlocks * this->GPUThreads; numberWorkitems = this->GPUThreads; klaunch_printf("T %d: Platform/Device: %d/%d\n", this->threadId, this->openCLPlatformId, this->gpuDeviceId); klaunch_printf("T %d: Workgroups/Workitems: %d/%d\n", this->threadId, numberWorkgroups, numberWorkitems); // Copy the per-step value to the kernel errorCode = clSetKernelArg (this->HashKernel, 14, sizeof(cl_uint), &this->perStep); errorCode |= clSetKernelArg (this->HashKernel, 20, sizeof(cl_uint), &this->saltStartOffset); if (errorCode != CL_SUCCESS) { printf("Error 1: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueNDRangeKernel(this->OpenCL->getCommandQueue(), this->HashKernel, 1 /* numDims */, NULL /* offset */, &numberWorkgroups, &numberWorkitems, NULL, NULL, &kernelLaunchEvent); if (errorCode != CL_SUCCESS) { printf("Error 2: %s\n", print_cl_errstring(errorCode)); exit(1); } if (clWaitForEvents(1, &kernelLaunchEvent) != CL_SUCCESS) { printf("\nError on wait for event!\n"); fflush(stdout); }; // Release the event to prevent memory leaks. clReleaseEvent(kernelLaunchEvent); }
void MFNHashTypeSaltedOpenCL_MD5_PS::copySaltConstantsToDevice() { trace_printf("MFNHashTypeSaltedOpenCL_MD5_PS::copySaltConstantsToDevice()\n"); cl_int errorCode = 0; // Salted hash data uint64_t localNumberSaltValues = this->numberSaltsCopiedToDevice; errorCode |= clSetKernelArg (this->HashKernel, 17, sizeof(cl_mem), &this->DeviceSaltLengthsAddress); errorCode |= clSetKernelArg (this->HashKernel, 18, sizeof(cl_mem), &this->DeviceSaltValuesAddress); errorCode |= clSetKernelArg (this->HashKernel, 19, sizeof(cl_ulong), &localNumberSaltValues); if (errorCode != CL_SUCCESS) { printf("Thread %d, dev %d: OpenCL error 5: %s. Exiting.\n", this->threadId, this->gpuDeviceId, print_cl_errstring(errorCode)); exit(1); } }
void MFNHashTypePlainOpenCL::setupDevice() { trace_printf("CHHashTypeVPlainCUDA::setupDevice()\n"); char buildOptions[1024]; cl_int errorCode; // Set the OpenCL platform & device trace_printf("Thread %d setting OpenCL platform/device to %d, %d\n", this->threadId, this->openCLPlatformId, this->gpuDeviceId); this->OpenCL->selectPlatformById(this->openCLPlatformId); this->OpenCL->selectDeviceById(this->gpuDeviceId); /** * Handle generating the kernels. This involves building with the specified * password length, vector width, and BFI_INT status. */ if (MultiforcerGlobalClassFactory.getCommandlinedataClass()->GetUseBfiInt()) { // BFI_INT patching - pass BITALIGN to kernel sprintf(buildOptions, "-D PASSWORD_LENGTH=%d -D VECTOR_WIDTH=%d -D BITALIGN=1", this->passwordLength, this->VectorWidth); } else { // No BFI_INT patching. sprintf(buildOptions, "-D PASSWORD_LENGTH=%d -D VECTOR_WIDTH=%d", this->passwordLength, this->VectorWidth); } this->OpenCL->buildProgramFromManySourcesConcat(this->getHashFileNames(), buildOptions); // If the BFI_INT patching is being used, patch the generated binary. if (MultiforcerGlobalClassFactory.getCommandlinedataClass()->GetUseBfiInt()) { this->OpenCL->doAMDBFIPatch(); } this->HashProgram = this->OpenCL->getProgram(); this->HashKernel = clCreateKernel (this->HashProgram, this->getHashKernelName().c_str(), &errorCode); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } }
void MFNHashTypePlainOpenCL_MD5::copyConstantDataToDevice() { trace_printf("MFNHashTypePlainOpenCL_MD5::copyConstantDataToDevice()\n"); cl_int errorCode; // Begin copying constant data to the device. errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceForwardCharsetAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->charsetForwardLookup.size() /* bytes to copy */, (void *)&this->charsetForwardLookup[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceReverseCharsetAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->charsetReverseLookup.size() /* bytes to copy */, (void *)&this->charsetReverseLookup[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceCharsetLengthsAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->charsetLengths.size() /* bytes to copy */, (void *)&this->charsetLengths[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap8kb_Address, CL_TRUE /* blocking write */, 0 /* offset */, 8192 /* bytes to copy */, (void *)&this->sharedBitmap8kb_a[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap16kb_Address, CL_TRUE /* blocking write */, 0 /* offset */, 16384 /* bytes to copy */, (void *)&this->sharedBitmap16kb_a[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap32kb_Address, CL_TRUE /* blocking write */, 0 /* offset */, 32768 /* bytes to copy */, (void *)&this->sharedBitmap32kb_a[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap256kb_a_Address, CL_TRUE /* blocking write */, 0 /* offset */, 256*1024 /* bytes to copy */, (void *)&this->globalBitmap256kb_a[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } // Copy the values into a variable that can be accessed as a pointer. uint64_t localNumberHashes = this->activeHashesProcessed.size(); uint64_t localNumberThreads = this->TotalKernelWidth; errorCode = 0; errorCode |= clSetKernelArg (this->HashKernel, 0, sizeof(cl_mem), &this->DeviceForwardCharsetAddress); errorCode |= clSetKernelArg (this->HashKernel, 1, sizeof(cl_mem), &this->DeviceReverseCharsetAddress); errorCode |= clSetKernelArg (this->HashKernel, 2, sizeof(cl_mem), &this->DeviceCharsetLengthsAddress); if (this->sharedBitmapSize == 8) { errorCode |= clSetKernelArg (this->HashKernel, 3, sizeof(cl_mem), &this->DeviceBitmap8kb_Address); } else if (this->sharedBitmapSize == 16) { errorCode |= clSetKernelArg (this->HashKernel, 3, sizeof(cl_mem), &this->DeviceBitmap16kb_Address); } else if (this->sharedBitmapSize == 32) { errorCode |= clSetKernelArg (this->HashKernel, 3, sizeof(cl_mem), &this->DeviceBitmap32kb_Address); } else { printf("Error: Invalid shared bitmap size! Must be 8, 16, 32\n"); exit(1); } errorCode |= clSetKernelArg (this->HashKernel, 4, sizeof(cl_ulong), &localNumberHashes); errorCode |= clSetKernelArg (this->HashKernel, 5, sizeof(cl_mem), &this->DeviceHashlistAddress); errorCode |= clSetKernelArg (this->HashKernel, 6, sizeof(cl_mem), &this->DeviceFoundPasswordsAddress); errorCode |= clSetKernelArg (this->HashKernel, 7, sizeof(cl_mem), &this->DeviceSuccessAddress); errorCode |= clSetKernelArg (this->HashKernel, 8, sizeof(cl_mem), &this->DeviceBitmap128mb_a_Address); errorCode |= clSetKernelArg (this->HashKernel, 9, sizeof(cl_mem), &this->DeviceBitmap128mb_b_Address); errorCode |= clSetKernelArg (this->HashKernel, 10, sizeof(cl_mem), &this->DeviceBitmap128mb_c_Address); errorCode |= clSetKernelArg (this->HashKernel, 11, sizeof(cl_mem), &this->DeviceBitmap128mb_d_Address); errorCode |= clSetKernelArg (this->HashKernel, 12, sizeof(cl_mem), &this->DeviceStartPointAddress); errorCode |= clSetKernelArg (this->HashKernel, 13, sizeof(cl_ulong), &localNumberThreads); errorCode |= clSetKernelArg (this->HashKernel, 15, sizeof(cl_mem), &this->DeviceStartPasswords32Address); errorCode |= clSetKernelArg (this->HashKernel, 16, sizeof(cl_mem), &this->DeviceBitmap256kb_a_Address); if (errorCode != CL_SUCCESS) { printf("Thread %d, dev %d: OpenCL error 5: %s. Exiting.\n", this->threadId, this->gpuDeviceId, print_cl_errstring(errorCode)); exit(1); } }
void getGPUUnitSupportedImageFormats(cl_context context){ cl_image_format supported_image_formats[1000]; cl_uint supported_image_format_list_size; //collect supported image formats cl_int status = clGetSupportedImageFormats( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, sizeof(supported_image_formats) / sizeof(supported_image_formats[0]), supported_image_formats, &supported_image_format_list_size); if (status != CL_SUCCESS) { printf("%s\n", print_cl_errstring(status)); exit(1); } for (int i = 0; i < supported_image_format_list_size; i++) { printf("Supported image format: "); switch (supported_image_formats[i].image_channel_order) { case CL_R: printf("CL_R"); break; case CL_A: printf("CL_A"); break; case CL_INTENSITY: printf("CL_INTENSITY"); break; case CL_LUMINANCE: printf("CL_LUMINANCE"); break; case CL_RG: printf("CL_RG"); break; case CL_RA: printf("CL_RA"); break; case CL_RGB: printf("CL_RGB"); break; case CL_RGBA: printf("CL_RGBA"); break; case CL_ARGB: printf("CL_ARGB"); break; case CL_BGRA: printf("CL_BGRA"); break; default: printf("Unknown"); break; } printf(", "); switch (supported_image_formats[i].image_channel_data_type) { case CL_UNORM_INT8: printf("CL_UNORM_INT8\n"); break; case CL_UNORM_INT16: printf("CL_UNORM_INT16\n"); break; case CL_SNORM_INT8: printf("CL_SNORM_INT8\n"); break; case CL_SNORM_INT16: printf("CL_SNORM_INT16\n"); break; case CL_HALF_FLOAT: printf("CL_HALF_FLOAT\n"); break; case CL_FLOAT: printf("CL_FLOAT\n"); break; case CL_UNORM_SHORT_565: printf("CL_UNORM_SHORT_565\n"); break; case CL_UNORM_SHORT_555: printf("CL_UNORM_SHORT_555\n"); break; case CL_UNORM_INT_101010: printf("CL_UNORM_INT_101010\n"); break; case CL_SIGNED_INT8: printf("CL_SIGNED_INT8\n"); break; case CL_UNSIGNED_INT8: printf("CL_UNSIGNED_INT8\n"); break; case CL_SIGNED_INT16: printf("CL_SIGNED_INT16\n"); break; case CL_SIGNED_INT32: printf("CL_SIGNED_INT32\n"); break; case CL_UNSIGNED_INT16: printf("CL_UNSIGNED_INT16\n"); break; case CL_UNSIGNED_INT32: printf("CL_UNSIGNED_INT32\n"); break; default: printf("Unknown\n"); break; } } }
void MFNHashTypePlainOpenCL::allocateThreadAndDeviceMemory() { trace_printf("MFNHashTypePlainOpenCL::allocateThreadAndDeviceMemory()\n"); /** * Error variable - stores the result of the various mallocs & such. */ cl_int errorCode; /* * Malloc the device hashlist space. This is the number of available hashes * times the hash length in bytes. The data will be copied later. */ memalloc_printf("Attempting to openclMalloc %d bytes for device hashlist for thread %d.\n", this->activeHashesProcessed.size() * this->hashLengthBytes, this->threadId); this->DeviceHashlistAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, this->activeHashesProcessed.size() * this->hashLengthBytes, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate %d bytes for device hashlist! Exiting!\n", this->activeHashesProcessed.size() * this->hashLengthBytes); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } /* * Allocate the host/device space for the success list (flags for found passwords). * This is a byte per password. To avoid atomic write issues, each password * gets a full addressible byte, and the GPU handles the dependencies between * multiple threads trying to set a flag in the same segment of memory. * * On the host, it will be allocated as mapped memory if we are using zerocopy. * * As this region of memory is frequently copied back to the host, mapping it * improves performance. In theory. */ memalloc_printf("Attempting to cudaHostAlloc %d bytes for HostSuccess\n", this->activeHashesProcessed.size()); this->HostSuccessAddress = new uint8_t [this->activeHashesProcessed.size()]; memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size()); // Allocate memory for the reported flags. this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()]; memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size()); // Allocate device memory for the "reported" flags, and copy in the zeroed // host memory for this region. this->DeviceSuccessAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, this->activeHashesProcessed.size(), this->HostSuccessAddress, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate %d bytes for device successlist! Exiting!\n", this->activeHashesProcessed.size()); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } /* * Allocate memory for the found passwords. As this is commonly copied * back and forth, it should be made zero copy if requested. * * This requires (number hashes * passwordLength) bytes of data. */ this->HostFoundPasswordsAddress = new uint8_t [this->passwordLength * this->activeHashesProcessed.size()]; // Clear the host found password space. memset(this->HostFoundPasswordsAddress, 0, this->passwordLength * this->activeHashesProcessed.size()); this->DeviceFoundPasswordsAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, this->passwordLength * this->activeHashesProcessed.size(), this->HostFoundPasswordsAddress, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate %d bytes for device passwordlist! Exiting!\n", this->passwordLength * this->activeHashesProcessed.size()); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } /** * Allocate space for host and device start positions. To improve performance, * this space is now aligned for improved coalescing performance. All the * position 0 elements are together, followed by all the position 1 elements, * etc. * * This memory can be allocated as write combined, as it is not read by * the host ever - only written. Since it is regularly transferred to the * GPU, this should help improve performance. */ this->HostStartPointAddress = new uint8_t [this->TotalKernelWidth * this->passwordLength]; this->DeviceStartPointAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, this->TotalKernelWidth * this->passwordLength, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate %d bytes for device start points! Exiting!\n", this->TotalKernelWidth * this->passwordLength); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } this->DeviceStartPasswords32Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, this->TotalKernelWidth * this->passwordLengthWords, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate %d bytes for device start passwords! Exiting!\n", this->TotalKernelWidth * this->passwordLengthWords); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } /** * Allocate memory for the things that are considered constant in CUDA * and not stored in global memory. For OpenCL, these are stored in a * constant-tagged chunk of global memory (or something) and therefore * need to have space allocated in global memory. */ this->DeviceBitmap8kb_a_Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, 8192, NULL, &errorCode); if (errorCode == CL_SUCCESS) { memalloc_printf("Successfully allocated 8kb Bitmap A\n"); } else { printf("Unable to allocate 8kb bitmap A\n"); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } this->DeviceForwardCharsetAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, MFN_HASH_TYPE_PLAIN_CUDA_MD5_MAX_CHARSET_LENGTH * this->passwordLength, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate forward charset\n"); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } this->DeviceReverseCharsetAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, MFN_HASH_TYPE_PLAIN_CUDA_MD5_MAX_CHARSET_LENGTH * this->passwordLength, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate reverse charset\n"); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } this->DeviceCharsetLengthsAddress = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, this->passwordLength, NULL, &errorCode); if (errorCode != CL_SUCCESS) { printf("Unable to allocate charset lengths\n"); printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } /** * Finally, attempt to allocate space for the giant device bitmaps. There * are 4x128MB bitmaps, and any number can be allocated. If they are not * fully allocated, their address is set to null as a indicator to the device * that there is no data present. Attempt to allocate as many as possible. * * This will be accessed regularly, so should probably not be zero copy. * Also, I'm not sure how mapping host memory into multiple threads would * work. Typically, if the GPU doesn't have enough RAM for the full * set of bitmaps, it's a laptop, and therefore may be short on host RAM * for the pinned access. * */ this->DeviceBitmap128mb_a_Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, 128 * 1024 * 1024, NULL, &errorCode); if (errorCode == CL_SUCCESS) { memalloc_printf("Successfully allocated Bitmap A\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap A\n"); this->DeviceBitmap128mb_a_Address = 0; } this->DeviceBitmap128mb_b_Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, 128 * 1024 * 1024, NULL, &errorCode); if (errorCode == CL_SUCCESS) { memalloc_printf("Successfully allocated Bitmap B\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap B\n"); this->DeviceBitmap128mb_b_Address = 0; } this->DeviceBitmap128mb_c_Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, 128 * 1024 * 1024, NULL, &errorCode); if (errorCode == CL_SUCCESS) { memalloc_printf("Successfully allocated Bitmap C\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap C\n"); this->DeviceBitmap128mb_c_Address = 0; } this->DeviceBitmap128mb_d_Address = clCreateBuffer (this->OpenCL->getContext(), CL_MEM_READ_ONLY, 128 * 1024 * 1024, NULL, &errorCode); if (errorCode == CL_SUCCESS) { memalloc_printf("Successfully allocated Bitmap D\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap D\n"); this->DeviceBitmap128mb_d_Address = 0; } memalloc_printf("Thread %d memory allocated successfully\n", this->threadId); }
void MFNHashTypePlainOpenCL::copyDataToDevice() { trace_printf("MFNHashTypePlainOpenCL::copyDataToDevice()\n"); cl_int errorCode; // Copy all the various elements of data to the device, forming them as needed. errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceHashlistAddress, CL_TRUE /* blocking write */, 0 /* offset */, this->activeHashesProcessedDeviceformat.size() /* bytes to copy */, (void *)&this->activeHashesProcessedDeviceformat[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } // Device bitmaps: Copy all relevant bitmaps to the device. // Only copy bitmaps that are created. if (this->DeviceBitmap128mb_a_Address) { memalloc_printf("Thread %d: Copying bitmap A\n", this->threadId); errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap128mb_a_Address, CL_TRUE /* blocking write */, 0 /* offset */, this->globalBitmap128mb_a.size() /* bytes to copy */, (void *)&this->globalBitmap128mb_a[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } } if (this->DeviceBitmap128mb_b_Address) { memalloc_printf("Thread %d: Copying bitmap B\n", this->threadId); errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap128mb_b_Address, CL_TRUE /* blocking write */, 0 /* offset */, this->globalBitmap128mb_b.size() /* bytes to copy */, (void *)&this->globalBitmap128mb_b[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } } if (this->DeviceBitmap128mb_c_Address) { memalloc_printf("Thread %d: Copying bitmap C\n", this->threadId); errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap128mb_c_Address, CL_TRUE /* blocking write */, 0 /* offset */, this->globalBitmap128mb_c.size() /* bytes to copy */, (void *)&this->globalBitmap128mb_c[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } } if (this->DeviceBitmap128mb_d_Address) { memalloc_printf("Thread %d: Copying bitmap D\n", this->threadId); errorCode = clEnqueueWriteBuffer (this->OpenCL->getCommandQueue(), this->DeviceBitmap128mb_d_Address, CL_TRUE /* blocking write */, 0 /* offset */, this->globalBitmap128mb_d.size() /* bytes to copy */, (void *)&this->globalBitmap128mb_d[0], NULL, NULL, NULL /* event list stuff */); if (errorCode != CL_SUCCESS) { printf("Error: %s\n", print_cl_errstring(errorCode)); exit(1); } } // Other data to the device - charset, etc. }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { std::cout << "Simple Image Processing Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("gaussian_filter.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create a command commands // if(!(commands = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum))) { std::cout << "Failed to create a command commands!" << std::endl; cleanKill(EXIT_FAILURE); } cl_kernel kernel = clCreateKernel(program, "gaussian_filter", &errNum); checkErr(errNum, "clCreateKernel(gaussian_filter)"); if(!doesGPUSupportImageObjects){ cleanKill(EXIT_FAILURE); } inputImage = LoadImage(context, (char*)"rgba.png", width, height); cl_image_format format; format.image_channel_order = CL_RGBA; format.image_channel_data_type = CL_UNORM_INT8; outputImage = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &format, width, height, 0, NULL, &errNum); if(there_was_an_error(errNum)){ std::cout << "Output Image Buffer creation error!" << std::endl; cleanKill(EXIT_FAILURE); } if (!inputImage || !outputImage ){ std::cout << "Failed to allocate device memory!" << std::endl; cleanKill(EXIT_FAILURE); } char *buffer = new char [width * height * 4]; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { width, height, 1}; sampler = clCreateSampler(context, CL_FALSE, // Non-normalized coordinates CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &errNum); if(there_was_an_error(errNum)){ std::cout << "Error creating CL sampler object." << std::endl; cleanKill(EXIT_FAILURE); } // Set the kernel arguments errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } //errNum = clGetKernelWorkGroupInfo(kernel, deviceIDs, CL_KERNEL_WORK_GROUP_SIZE, sizeof(unsigned short)* height*width*4, &local, NULL); // if (errNum != CL_SUCCESS) // { // cout << print_cl_errstring(err) << endl; // if(err == CL_INVALID_VALUE){ // cout << "if param_name is not valid, or if size in bytes specified by param_value_size " // << "is less than the size of return type as described in the table above and " // << "param_value is not NULL." << endl; // } // cout << "Error: Failed to retrieve kernel work group info!" << err << endl; // cleanKill(EXIT_FAILURE); // } std::cout << "Max work group size is " << CL_DEVICE_MAX_WORK_GROUP_SIZE << std::endl; std::cout << "Max work item size is " << CL_DEVICE_MAX_WORK_ITEM_SIZES << std::endl; size_t localWorkSize[2]; size_t globalWorkSize[2]; localWorkSize[0] = 1; localWorkSize[1] = localWorkSize[0]; globalWorkSize[0] = width*height; globalWorkSize[1] = globalWorkSize[0]; //CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and number of work-items specified by global_work_size is not evenly divisable by size of work-group given by local_work_size //size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width), RoundUp(localWorkSize[1], height)}; // size_t globalWorkSize[1] = {sizeof(unsigned short)* height * width}; // size_t localWorkSize[1] = {64}; // Queue the kernel up for execution errNum = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS){ std::cerr << "Error queuing kernel for execution." << std::endl; std::cerr << print_cl_errstring(errNum) << std::endl; cleanKill(EXIT_FAILURE); } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back computed data errNum = clEnqueueReadImage(commands, outputImage, CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL); SaveImage((char*)"outRGBA.png", (char*)buffer, width, height); std::cout << "Program completed successfully" << std::endl; return 0; }
cl_program OclHost::setUpProgram(char const * const oclSwScore, std::string buildOptions) { //char const * additional_options_nv = " -cl-nv-verbose -cl-fast-relaxed-math"; //char * additional_options = 0; size_t program_length = strlen(oclSwScore); if (strcasestr(clPlatformName, "NVIDIA") != 0) { buildOptions += " -D __NVIDIA__ -cl-nv-verbose -cl-fast-relaxed-math"; } cl_int ciErrNum = 0; //Log.Message("Source: %s\n===========================", oclSwScore); // create the program cl_program cpProgram = clCreateProgramWithSource(oclGpuContext, 1, (const char **) &oclSwScore, &program_length, &ciErrNum); // checkClError("Unable to build program.", ciErrNum); if (ciErrNum == CL_SUCCESS) { // build the program Log.Verbose("Build Options: %s", buildOptions.c_str()); ciErrNum = clBuildProgram(cpProgram, 0, NULL, buildOptions.c_str(), NULL, NULL); if (ciErrNum != CL_SUCCESS) Log.Error("Build failed: %s", print_cl_errstring(ciErrNum)); //checkClError("Unable to build program (clBuildProgram).", ciErrNum); //clUnloadCompiler(); char cBuildLog[10240]; clGetProgramBuildInfo(cpProgram, oclDevice, CL_PROGRAM_BUILD_OPTIONS, sizeof(cBuildLog), cBuildLog, NULL); Log.Verbose("Build options: %s", cBuildLog); cl_build_status status; clGetProgramBuildInfo(cpProgram, oclDevice, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL); if (status != CL_BUILD_SUCCESS) { Log.Message("Build status: %s", print_cl_buildstatus(status)); clGetProgramBuildInfo(cpProgram, oclDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL); Log.Message("Build log:"); char * pBuildLog = strtok(cBuildLog, "\n"); while (pBuildLog != NULL) { if (strlen(pBuildLog) > 1) { Log.Message("%s", pBuildLog); } pBuildLog = strtok(NULL, "\n"); } } checkClError("Unable to build program end.", ciErrNum); return cpProgram; } else { Log.Error("Unable to load OpenCl kernel source. Error: %d", ciErrNum); } return 0; }
void OclHost::checkClError(char const * msg, cl_int ciErrNum) { if (ciErrNum != CL_SUCCESS) { Log.Error("%s\nError: %s (%d)", msg, print_cl_errstring(ciErrNum), ciErrNum); throw; } }