/*
	(Re)creates this 2D image from an AlloArray and attaches it to ctx.

	Any previously held state is released first via destroy()/detach().
	The image width is the array's first dimension. When the array has at
	least two dimensions the height and row stride come from dim[1] and
	stride[1]; otherwise the height is 1 and the row stride is the value
	returned by allo_array_size().

	If clCreateImage2D reports an error (as judged by opencl_error) the
	function returns without storing a handle or attaching to the context.
*/
void OpenCLImage2D :: create(
	OpenCLContext &ctx,
	cl_mem_flags usage,
	AlloArray *array
) {
	destroy();
	detach();

	usage = OpenCLMemoryBuffer::check_memory_flags(usage, array->data.ptr);

	// Derive the image geometry from the array header.
	size_t width = array->header.dim[0];
	size_t height;
	size_t rowstride;
	if(array->header.dimcount >= 2) {
		height = array->header.dim[1];
		rowstride = array->header.stride[1];
	}
	else {
		height = 1;
		rowstride = allo_array_size(array);
	}

	cl_image_format image_format = OpenCLImageFormat::format_from_array(array);

	cl_int status = CL_SUCCESS;
	cl_mem image = clCreateImage2D(
		ctx.get_context(),
		usage,
		&image_format,
		width,
		height,
		rowstride,
		array->data.ptr,
		&status
	);

	// Only keep the handle and register with the context on success.
	if(! opencl_error(status, "clCreateImage2D error creating buffer")) {
		mMem = image;
		ctx.attach_resource(this);
	}
}
/*
	(Re)creates this 2D image from an explicit format, geometry and host
	pointer, then attaches it to ctx.

	Any previously held state is released first via destroy()/detach().
	The usage flags are passed through
	OpenCLMemoryBuffer::check_memory_flags() together with ptr before the
	image is created.

	If clCreateImage2D reports an error (as judged by opencl_error) the
	function returns without storing a handle or attaching to the context.
*/
void OpenCLImage2D :: create(
	OpenCLContext &ctx,
	cl_mem_flags usage,
	const cl_image_format *format,
	size_t width,
	size_t height,
	size_t rowstride,
	void *ptr
) {
	destroy();
	detach();

	usage = OpenCLMemoryBuffer::check_memory_flags(usage, ptr);

	cl_int status = CL_SUCCESS;
	cl_mem image = clCreateImage2D(
		ctx.get_context(),
		usage,
		format,
		width,
		height,
		rowstride,
		ptr,
		&status
	);

	// Only keep the handle and register with the context on success.
	if(! opencl_error(status, "clCreateImage2D error creating buffer")) {
		mMem = image;
		ctx.attach_resource(this);
	}
}
// Runs one interleaved clFFT transform (batch size 1) from 'input' into 'output'.
//
// Unless 'direction' equals -1, both buffers must report the same element
// width (checked with EXCEPTION_ASSERT). The plan is requested for the input
// width. Throws std::runtime_error if the plan cannot be obtained or if the
// execution call reports an error.
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, FftDirection direction )
{
    TIME_STFT TaskTimer tt("Fft ClFft");

    unsigned n = input->getNumberOfElements().width;
    unsigned N = output->getNumberOfElements().width;

    if (direction != -1)
        EXCEPTION_ASSERT( n == N );

    {
        TIME_STFT TaskTimer stepTimer("Computing fft(N=%u, n=%u, direction=%d)", N, n, direction);

        OpenCLContext *clContext = &OpenCLContext::Singleton();

        cl_int planStatus;
        clFFT_Plan fftPlan = CLFFTKernelBuffer::Singleton().getPlan(clContext->getContext(), n, planStatus);
        if (planStatus != CL_SUCCESS)
            throw std::runtime_error("Could not create clFFT compute plan.");

        // Run the fft in OpenCL :)
        // fft kernel needs to have read/write access to output data
        // The storage accessors are kept as temporaries inside the call so
        // they live exactly as long as the call expression.
        const cl_int runStatus = clFFT_ExecuteInterleaved(
                clContext->getCommandQueue(),
                fftPlan, 1, static_cast<clFFT_Direction>(direction),
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );
        if (runStatus != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
// Runs a batch of interleaved clFFT transforms from 'input' into 'output'.
//
// 'n.width' selects the plan and 'n.height' is passed as the batch size.
// The two buffers must have the same byte size (checked with
// EXCEPTION_ASSERT). Throws std::runtime_error if the plan cannot be obtained
// or if the execution call reports an error.
void FftClFft::
        compute( Tfr::ChunkData::Ptr input, Tfr::ChunkData::Ptr output, DataStorageSize n, FftDirection direction )
{
    const bool isForward = direction==FftDirection_Forward;

    TaskTimer tt("Stft::computeWithClFft( matrix[%d, %d], %s )",
                 input->size().width,
                 input->size().height,
                 isForward?"forward":"backward");

    EXCEPTION_ASSERT( output->numberOfBytes() == input->numberOfBytes() );

    const int batchCount = n.height;

    OpenCLContext *clContext = &OpenCLContext::Singleton();

    cl_int planStatus;
    clFFT_Plan fftPlan = CLFFTKernelBuffer::Singleton().getPlan(clContext->getContext(), n.width, planStatus);
    if (planStatus != CL_SUCCESS)
        throw std::runtime_error("Could not create clFFT compute plan.");

    {
        TaskTimer batchTimer("Calculating batches");

        // Run the fft in OpenCL :)
        // The storage accessors are kept as temporaries inside the call so
        // they live exactly as long as the call expression.
        const cl_int runStatus = clFFT_ExecuteInterleaved(
                clContext->getCommandQueue(),
                fftPlan, batchCount, isForward?clFFT_Forward:clFFT_Inverse,
                OpenClMemoryStorage::ReadOnly<1>( input ).ptr(),
                OpenClMemoryStorage::ReadWrite<1>( output ).ptr(),
                0, NULL, NULL );
        if (runStatus != CL_SUCCESS)
            throw std::runtime_error("Bad stuff happened during FFT computation.");
    }
}
/*
	(Re)creates the command queue for device 'dev' in context 'ctx'.

	Any previously held state is released first via destroy()/detach().

	profiling  adds CL_QUEUE_PROFILING_ENABLE to the queue properties.
	ordered    when false, out-of-order execution is requested, but only if
	           the device's queue properties advertise it; otherwise a user
	           error is reported through opencl_error() and the queue is
	           created without that flag.

	If clCreateCommandQueue reports an error (as judged by opencl_error) the
	function returns without storing a handle or attaching to the context.
*/
void OpenCLCommandQueue :: create(OpenCLContext &ctx, const OpenCLDevice &dev, bool ordered, bool profiling) {
	destroy();
	detach();

	cl_command_queue_properties properties = 0;
	if(profiling) {
		properties |= CL_QUEUE_PROFILING_ENABLE;
	}

	if(! ordered) {
		const bool out_of_order_supported =
			GET_FLAG(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, dev.get_queue_properties());

		if(! out_of_order_supported) {
			opencl_error(USER_OPENCL_ERROR, "Device doesn't support out of order execution ... disabling");
		}
		else {
			properties |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
		}
	}

	cl_int status = CL_SUCCESS;
	cl_command_queue created_queue = clCreateCommandQueue(
		ctx.get_context(),
		dev.get_device(),
		properties,
		&status
	);

	// Only keep the handle and register with the context on success.
	if(! opencl_error(status, "clCreateCommandQueue error creating command queue")) {
		mCommandQueue = created_queue;
		ctx.attach_resource(this);
	}
}
// Builds the sort kernels for the given trait, picks work-group sizes for
// them and, unless the data fits the "short list" path, allocates the
// workspace arrays used by the bucket sort.
//
// context  OpenCL context used to build the program and allocate arrays.
// trait    supplies the type names, key expressions and element sizes that
//          are substituted into the kernel source.
// length   number of elements that will be sorted.
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> defines;
    defines["DATA_TYPE"] = trait->getDataType();
    defines["KEY_TYPE"] = trait->getKeyType();
    defines["SORT_KEY"] = trait->getSortKey();
    defines["MIN_KEY"] = trait->getMinKey();
    defines["MAX_KEY"] = trait->getMaxKey();
    defines["MAX_VALUE"] = trait->getMaxValue();
    if (trait->getDataType() == std::string("int2"))
        defines["VALUE_IS_INT2"] = "1";
    else
        defines["VALUE_IS_INT2"] = "0";
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, defines));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Half of the elements that would fit into local memory.
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
    // If we officially support Qualcomm in the future, we'll need to do something better.
    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);

    // Largest power of two that does not exceed maxRangeSize (at least 1).
    rangeKernelSize = 1;
    while (rangeKernelSize*2 <= maxRangeSize)
        rangeKernelSize *= 2;

    // These two are derived from the power-of-two size before it is clamped
    // to the data length below.
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
    if (isShortList)
        sortKernelSize = rangeKernelSize;
    else
        sortKernelSize = rangeKernelSize/2;

    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    unsigned int targetBucketSize = sortKernelSize/2;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets == 0)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  The short list path does not allocate any.

    if (isShortList)
        return;
    dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
    bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
    bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
    offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
    buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
}
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
// Builds the sort kernels for the given trait, picks work-group sizes for
// them and, unless the data fits the "short list" path, allocates the
// workspace arrays used by the bucket sort.
//
// context  OpenCL context used to build the program and allocate arrays.
// trait    supplies the type names, key expressions and element sizes that
//          are substituted into the kernel source.
// length   number of elements that will be sorted.
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
            dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> defines;
    defines["DATA_TYPE"] = trait->getDataType();
    defines["KEY_TYPE"] = trait->getKeyType();
    defines["SORT_KEY"] = trait->getSortKey();
    defines["MIN_KEY"] = trait->getMinKey();
    defines["MAX_KEY"] = trait->getMaxKey();
    defines["MAX_VALUE"] = trait->getMaxValue();
    if (trait->getDataType() == std::string("int2"))
        defines["VALUE_IS_INT2"] = "1";
    else
        defines["VALUE_IS_INT2"] = "0";
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, defines));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Half of the elements that would fit into local memory.
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    isShortList = (length <= maxLocalBuffer);

    // Largest power of two that does not exceed maxGroupSize (at least 1).
    rangeKernelSize = 1;
    while (rangeKernelSize*2 <= maxGroupSize)
        rangeKernelSize *= 2;

    // These two are derived from the power-of-two size before it is clamped
    // to the data length below.
    positionsKernelSize = rangeKernelSize;
    if (isShortList)
        sortKernelSize = rangeKernelSize;
    else
        sortKernelSize = rangeKernelSize/2;

    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    unsigned int targetBucketSize = sortKernelSize/2;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets == 0)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  The short list path does not allocate any.

    if (isShortList)
        return;
    dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
    bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
    bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
    offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
    buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
}
// Allocates a device buffer of size*elementSize bytes that this object owns.
//
// context      context in which the buffer is created.
// size         number of elements.
// elementSize  size of one element in bytes.
// name         label used in the error message.
// flags        memory flags forwarded to cl::Buffer.
//
// Throws OpenMMException (carrying the array name, the OpenCL error text and
// its code) if cl::Buffer construction raises cl::Error.
OpenCLArray::OpenCLArray(OpenCLContext& context, int size, int elementSize, const std::string& name, cl_int flags) :
        context(context), size(size), elementSize(elementSize), name(name), ownsBuffer(true) {
    try {
        // Multiply in size_t: the product of two ints overflows for buffers
        // of 2 GiB or more before it ever reaches the allocator.
        const size_t numBytes = static_cast<size_t>(size)*static_cast<size_t>(elementSize);
        buffer = new cl::Buffer(context.getContext(), flags, numBytes);
    }
    catch (const cl::Error& err) {
        std::stringstream str;
        str<<"Error creating array "<<name<<": "<<err.what()<<" ("<<err.err()<<")";
        throw OpenMMException(str.str());
    }
}
// Sets up the V3 momentum search on one OpenCL device: loads the program,
// creates the command queue and device buffers, and binds the kernel
// arguments once so find_collisions() only has to enqueue work.
//
// _HASH_BITS   the hash table is allocated with 1<<_HASH_BITS uint32_t slots.
// _device_num  index of the device obtained from OpenCLMain.
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
    max_threads = 1<<30; // very big
    HASH_BITS = _HASH_BITS;
    device_num = _device_num;

    OpenCLMain& main = OpenCLMain::getInstance();

    // checks if device exists
    if (main.getNumDevices() <= device_num) {
        printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
        assert(false);
    }

    OpenCLDevice *device = main.getDevice(device_num);

    // compiles
    fprintf(stdout, "Starting OpenCLMomentum V3\n");
    fprintf(stdout, "Device %02d: %s\n", device_num, device->getName().c_str());
    cl_ulong maxWorkGroupSize = device->getMaxWorkGroupSize();
    // cl_ulong is not 'unsigned long long' on every platform; cast so the
    // argument always matches the %llu conversion.
    fprintf(stdout, "Max work group size: %llu\n", static_cast<unsigned long long>(maxWorkGroupSize));

    if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

    OpenCLContext *context = device->getContext();
    std::vector<std::string> program_filenames;
    program_filenames.push_back("opencl/opencl_cryptsha512.h");
    program_filenames.push_back("opencl/cryptsha512_kernel.cl");
    program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
    OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

    // prealoc kernels
    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    // only one queue, helps with memory leaking
    queue = context->createCommandQueue(device);

    // allocate internal structure
    cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
    internal_hash_table = context->createBuffer(sizeof(uint32_t)*(1<<HASH_BITS), CL_MEM_READ_WRITE, NULL);
    temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
    temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

    // sets args
    kernel_cleanup->resetArgs();
    kernel_cleanup->addGlobalArg(internal_hash_table);

    kernel->resetArgs();
    kernel->addGlobalArg(cl_message);
    kernel->addGlobalArg(internal_hash_table);
    uint32_t ht_size = 1<<HASH_BITS;
    kernel->addScalarUInt(ht_size);
    kernel->addGlobalArg(temp_collisions);
    kernel->addGlobalArg(temp_collisions_count);
}
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
// Initialises all array pointers to NULL and chooses the launch
// configuration for the nonbonded force kernels: number of thread blocks,
// thread block size and number of force buffers.
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
        numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
    // Decide how many thread blocks and force buffers to use.

    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    if (deviceIsCpu) {
        // One thread per block and one force buffer per block.
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = 1;
        numForceBuffers = numForceThreadBlocks;
        return;
    }

    const bool has64BitAtomics = context.getSupports64BitGlobalAtomics();
    if (context.getSIMDWidth() == 32) {
        // Block count scales with the number of compute units; the factor
        // depends on whether 64 bit global atomics are available.
        numForceThreadBlocks = (has64BitAtomics ? 4 : 3)*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
        forceThreadBlockSize = 256;
    }
    else {
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
    }

    if (has64BitAtomics) {
        // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
        numForceBuffers = 1;
    }
    else {
        numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
    }
}
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) { // temp storage *out_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes"); OpenCLKernel *kernel_fill_table = program->getKernel("fill_table"); OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num); // cleans up the hash table size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device); kc_wgsize = 1<<log2(kc_wgsize); queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize); // printf("Cleaning the HT\n"); // queue->finish(); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); // step 1, calculate hashes size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device); kcah_wgsize = 1<<log2(kcah_wgsize); queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8, kcah_wgsize); // uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE]; // queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE); // queue->finish(); // // printf("testing hashes\n"); // uint64_t count = 0; // for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) { // if (apa[i] == 0) { // count++; // printf("BAD HASH AT: %d %X\n", i, 
apa[i]); // } // } // printf("counted %X bad hashes\n", count); // printf("NOW REALLY TEST THEM hashes\n"); // count = 0; // for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) { // sph_sha512_context c512_sph; //SPH // sph_sha512_init(&c512_sph); // sph_sha512(&c512_sph, &i, 4); // sph_sha512(&c512_sph, message, 32); // uint64_t out[8]; // sph_sha512_close(&c512_sph, out); // for (int j =0; j < 8; j++) { // if (apa[i+j] != out[j]) { // count++; // uint64_t xxx = apa[i+j]; // printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]); // } // } // } // printf("counted %X bad hashes\n", count); // step 2, populate hashtable size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device); kft_wgsize = 1<<log2(kft_wgsize); queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE, kft_wgsize); // printf("step 2, populate hashtable\n"); // queue->finish(); queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t)); // step 3, find collisions size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device); kfc_wgsize = 1<<log2(kfc_wgsize); queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE, kfc_wgsize); // printf("step 3, find collisions\n"); // queue->finish(); queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t)); queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling()); // printf("step 4, copy output\n"); queue->finish(); #ifdef DEBUG printf("Collision Count = %d\n", (*out_count)); #endif }
// Sets kernel argument 'index' to the inverse periodic box size, using the
// double precision value when the context runs in double precision and the
// single precision value otherwise.
static void setInvPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (!cl.getUseDoublePrecision()) {
        kernel.setArg<mm_float4>(index, cl.getInvPeriodicBoxSize());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getInvPeriodicBoxSizeDouble());
}
// Builds the compaction program, creates its two kernels and allocates the
// per-thread-block counter array (one cl_uint per thread block reported by
// the context).
OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
    cl::Program program = context.createProgram(OpenCLKernelSources::compact);
    countKernel = cl::Kernel(program, "countElts");
    moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");

    // Allocate the raw-owned array last: if building the program or the
    // kernels throws, the destructor never runs and an earlier allocation
    // would be leaked.
    dgBlockCounts = OpenCLArray::create<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
}
// Sets five consecutive kernel arguments starting at 'index', in this order:
// periodic box size, inverse periodic box size, and box vectors X, Y and Z.
// The double precision values are used when the context runs in double
// precision, the single precision values otherwise.
static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (!cl.getUseDoublePrecision()) {
        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxSize());
        kernel.setArg<mm_float4>(index+1, cl.getInvPeriodicBoxSize());
        kernel.setArg<mm_float4>(index+2, cl.getPeriodicBoxVecX());
        kernel.setArg<mm_float4>(index+3, cl.getPeriodicBoxVecY());
        kernel.setArg<mm_float4>(index+4, cl.getPeriodicBoxVecZ());
        return;
    }
    kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
    kernel.setArg<mm_double4>(index+1, cl.getInvPeriodicBoxSizeDouble());
    kernel.setArg<mm_double4>(index+2, cl.getPeriodicBoxVecXDouble());
    kernel.setArg<mm_double4>(index+3, cl.getPeriodicBoxVecYDouble());
    kernel.setArg<mm_double4>(index+4, cl.getPeriodicBoxVecZDouble());
}