// Sets up the V3 momentum searcher on one OpenCL device.
//
// Checks that the requested device exists, builds the program from its three
// source files, creates the command queue and the device buffers, and binds
// the arguments of "kernel_sha512" and "kernel_clean_hash_table".
//
// _HASH_BITS  - log2 of the number of 32-bit entries in the device hash table.
// _device_num - index of the OpenCL device this instance runs on.
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
    max_threads = 1<<30; // very big
    HASH_BITS = _HASH_BITS;
    device_num = _device_num;

    OpenCLMain& main = OpenCLMain::getInstance();

    // checks if device exists
    // NOTE(review): with NDEBUG the assert is compiled out and construction
    // continues with an invalid device index -- confirm this is acceptable.
    if (main.getNumDevices() <= device_num) {
        printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
        assert(false);
    }

    // compiles
    fprintf(stdout, "Starting OpenCLMomentum V3\n");
    fprintf(stdout, "Device %02d: %s\n", device_num, main.getDevice(device_num)->getName().c_str());

    cl_ulong maxWorkGroupSize = main.getDevice(device_num)->getMaxWorkGroupSize();
    // cl_ulong is not `unsigned long long` on every platform, so cast to match %llu.
    fprintf(stdout, "Max work group size: %llu\n", static_cast<unsigned long long>(maxWorkGroupSize));

    if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

    OpenCLContext *context = main.getDevice(device_num)->getContext();

    std::vector<std::string> program_filenames;
    program_filenames.push_back("opencl/opencl_cryptsha512.h");
    program_filenames.push_back("opencl/cryptsha512_kernel.cl");
    program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
    OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

    // prealoc kernels
    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    // only one queue, helps with memory leaking
    queue = context->createCommandQueue(main.getDevice(device_num));

    // Number of hash table entries, computed with unsigned arithmetic so the
    // shift cannot overflow a signed int for large HASH_BITS.
    const size_t ht_entries = static_cast<size_t>(1) << HASH_BITS;

    // allocate internal structure
    cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
    internal_hash_table = context->createBuffer(sizeof(uint32_t)*ht_entries, CL_MEM_READ_WRITE, NULL);
    temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
    temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

    // sets args
    kernel_cleanup->resetArgs();
    kernel_cleanup->addGlobalArg(internal_hash_table);

    kernel->resetArgs();
    kernel->addGlobalArg(cl_message);
    kernel->addGlobalArg(internal_hash_table);
    uint32_t ht_size = static_cast<uint32_t>(1) << HASH_BITS;
    kernel->addScalarUInt(ht_size);
    kernel->addGlobalArg(temp_collisions);
    kernel->addGlobalArg(temp_collisions_count);
}
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { path_trace_program.add_kernel(ustring("path_trace")); programs.push_back(&path_trace_program); return true; }
/* Tear-down: stop the task pool first, then release the data-init program
 * and destroy the split kernel object. */
~OpenCLDeviceSplitKernel()
{
	task_pool.stop();

	/* Release kernels */
	program_data_init.release();

	delete split_kernel;
}
virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { bool single_program = OpenCLInfo::use_single_program(); program_data_init = OpenCLDeviceBase::OpenCLProgram(this, single_program ? "split" : "split_data_init", single_program ? "kernel_split.cl" : "kernel_data_init.cl", get_build_options(this, requested_features)); program_data_init.add_kernel(ustring("path_trace_data_init")); programs.push_back(&program_data_init); program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, single_program ? "split" : "split_state_buffer_size", single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl", get_build_options(this, requested_features)); program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); programs.push_back(&program_state_buffer_size); return split_kernel->load_kernels(requested_features); }
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) { // temp storage *out_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes"); OpenCLKernel *kernel_fill_table = program->getKernel("fill_table"); OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num); // cleans up the hash table size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device); kc_wgsize = 1<<log2(kc_wgsize); queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize); // printf("Cleaning the HT\n"); // queue->finish(); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); // step 1, calculate hashes size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device); kcah_wgsize = 1<<log2(kcah_wgsize); queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8, kcah_wgsize); // uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE]; // queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE); // queue->finish(); // // printf("testing hashes\n"); // uint64_t count = 0; // for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) { // if (apa[i] == 0) { // count++; // printf("BAD HASH AT: %d %X\n", i, 
apa[i]); // } // } // printf("counted %X bad hashes\n", count); // printf("NOW REALLY TEST THEM hashes\n"); // count = 0; // for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) { // sph_sha512_context c512_sph; //SPH // sph_sha512_init(&c512_sph); // sph_sha512(&c512_sph, &i, 4); // sph_sha512(&c512_sph, message, 32); // uint64_t out[8]; // sph_sha512_close(&c512_sph, out); // for (int j =0; j < 8; j++) { // if (apa[i+j] != out[j]) { // count++; // uint64_t xxx = apa[i+j]; // printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]); // } // } // } // printf("counted %X bad hashes\n", count); // step 2, populate hashtable size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device); kft_wgsize = 1<<log2(kft_wgsize); queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE, kft_wgsize); // printf("step 2, populate hashtable\n"); // queue->finish(); queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t)); // step 3, find collisions size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device); kfc_wgsize = 1<<log2(kfc_wgsize); queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE, kfc_wgsize); // printf("step 3, find collisions\n"); // queue->finish(); queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t)); queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling()); // printf("step 4, copy output\n"); queue->finish(); #ifdef DEBUG printf("Collision Count = %d\n", (*out_count)); #endif }
int main() { if(OpenCLRuntime::Initialize() != SICKL_SUCCESS) { printf("Could not OpenCL Context\n"); return -1; } Mandelbrot mbrot; mbrot.Parse(); mbrot.GetRoot().Print(); OpenCLProgram program; OpenCLCompiler::Build(mbrot, program); #if 0 // init GLEW/GLUT and other gl setup if(!OpenGLRuntime::Initialize()) { printf("Could not create OpenGL Context\n"); return -1; } OpenGLCompiler comp; Mandelbrot mbrot; mbrot.Parse(); /// Prints the AST generated from the Mandelbrot source mbrot.GetRoot().Print(); /// Compile our OpenGL program OpenGLProgram* program = comp.Build(mbrot); /// Print the generated GLSL source printf("%s\n", program->GetSource().c_str()); const uint32_t width = 350 * 5; const uint32_t height = 200 * 5; const uint32_t colors = mbrot.max_iterations; /// Generate the color table (a nice gold) float* color_map_data = new float[3 * colors]; for(uint32_t i = 0; i < colors; i++) { float x = i/(float)colors; color_map_data[3 * i + 0] = 191.0f / 255.0f * (1.0f - x); color_map_data[3 * i + 1] = 125.0f / 255.0f * (1.0f - x); color_map_data[3 * i + 2] = 37.0f / 255.0f * (1.0f - x); } /// put it int a 1d buffer OpenGLBuffer1D color_map(colors, ReturnType::Float3, color_map_data); /// our output buffer OpenGLBuffer2D result(width, height, ReturnType::Float3, nullptr); OpenGLBuffer2D copy(width, height, ReturnType::Float3, nullptr); /// initialize our program program->Initialize(width, height); /// get our binding locations for each of the program input and outputs input_t min_loc = program->GetInputHandle("min"); input_t max_loc = program->GetInputHandle("max"); input_t color_map_loc = program->GetInputHandle("color_map"); output_t output_loc = program->GetOutputHandle("output"); /// sets min values program->SetInput(min_loc, -2.5f, -1.0f); /// sets max values program->SetInput(max_loc, 1.0f, 1.0f); /// set the scaler program->SetInput(color_map_loc, color_map); /// sets the render location program->BindOutput(output_loc, result); /// Runs the program 
program->Run(); /// We can copy our data to the second buffer copy.SetData(result); float* result_buffer = nullptr; /// We can either read result back from the texture copy.GetData(result_buffer); /// Or from the framebuffer (which is faster on nvidia hardware at least) program->GetOutput(output_loc, result_buffer); /// Finally, dump the image to a Bitmap to view BMP image; image.SetSize(width, height); for(uint32_t i = 0; i < height; i++) { for(uint32_t j = 0; j < width; j++) { float red = result_buffer[i * width * 3 + j * 3 + 0]; float green = result_buffer[i * width * 3 + j * 3 + 1]; float blue = result_buffer[i * width * 3 + j * 3 + 2]; auto pixel = image(j,i); pixel->Red = (uint8_t)(red * 255); pixel->Green = (uint8_t)(green * 255); pixel->Blue = (uint8_t)(blue * 255); } } image.WriteToFile("result.bmp"); /// Cleanup free(result_buffer); delete program; OpenGLRuntime::Finalize(); #endif }
// Sets up the Protoshare OpenCL miner on device `_device_num`.
//
// Steps, in order:
//   1. Look up the device and dump its information.
//   2. Take the work group size, bucket count, vector type, bucket size and
//      memory target from commandlineInput, deriving bucket_size from the
//      memory target when one is set (or when neither is set).
//   3. Validate the configuration against the device's maximum allocation
//      size and global memory size; on failure print an error and exit.
//   4. Build the OpenCL program with the chosen parameters as -D defines.
//   5. Fetch the "hash_step" / "reset_and_seek" kernels and create the
//      device buffers and the command queue.
ProtoshareOpenCL::ProtoshareOpenCL(int _device_num) {
    this->device_num = _device_num;

    printf("Initializing GPU %d\n", device_num);
    OpenCLMain &main = OpenCLMain::getInstance();
    this->device = main.getDevice(device_num);

    printf("======================================================================\n");
    printf("Device information for: %s\n", device->getName().c_str());
    device->dumpDeviceInfo(); // Makes troubleshooting easier
    printf("======================================================================\n");
    printf("\n");

    // Sanitize input parameters
    if (commandlineInput.wgs == 0) {
        this->wgs = device->getMaxWorkGroupSize();
    } else {
        this->wgs = commandlineInput.wgs;
    }
    this->buckets_log2 = commandlineInput.buckets_log2;
    this->vect_type = commandlineInput.vect_type;
    this->bucket_size = commandlineInput.bucket_size;
    this->target_mem = commandlineInput.target_mem;

    // If bucket size unset and target memory unset, use maximum usable memory
    if (target_mem == 0 && bucket_size == 0) {
        target_mem = device->getGlobalMemSize() / 1024 / 1024;
    }

    // If set, convert target memory into a usable value for bucket_size
    if (target_mem > 0) {
        // Convert target from MB to bytes. Done in 64 bits: a 32-bit product
        // wraps around for targets of 4096 MB and above.
        const cl_ulong target_mem_bytes = static_cast<cl_ulong>(target_mem) * 1024 * 1024;

        // Lazy calculation, assume large bucket_size, scale back from there
        // until the total usage no longer exceeds the target.
        bucket_size = 1024;
        while (bucket_size > 0 && calc_total_mem_usage(buckets_log2, bucket_size) > target_mem_bytes) {
            bucket_size--;
        }

        // Make sure the parameter configuration is sane:
        if (bucket_size < 1) {
            printf("ERROR: Memory target of %d MB cannot be attained with 2^%d buckets!\n", target_mem, buckets_log2);
            printf(" Please lower the value of \"-b\" or increase the value of \"-m\".\n");
            exit(0);
        }
    }

    // cl_ulong values are printed through unsigned long long / %llu below;
    // passing them to %d is a format/argument mismatch.
    unsigned long long available_mb = 0;
    unsigned long long required_mb = 0;

    // Make sure we can allocate hash_list (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    cl_ulong required_mem = calc_hash_mem_usage(buckets_log2, bucket_size);
    cl_ulong available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        available_mb = static_cast<unsigned long long>(available_mem / 1024 / 1024);
        required_mb = static_cast<unsigned long long>(required_mem / 1024 / 1024);
        printf("ERROR: Device %d cannot allocate 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n", available_mb, required_mb);
        printf(" Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // Make sure we can allocate nonce_map (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    required_mem = calc_index_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        available_mb = static_cast<unsigned long long>(available_mem / 1024 / 1024);
        required_mb = static_cast<unsigned long long>(required_mem / 1024 / 1024);
        printf("ERROR: Device %d cannot allocate index of 2^%d elements!\n", device_num, buckets_log2);
        printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n", available_mb, required_mb);
        printf(" Please lower the value of \"-b\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // Make sure the whole thing fits in memory
    required_mem = calc_total_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getGlobalMemSize();
    if (required_mem > available_mem) {
        available_mb = static_cast<unsigned long long>(available_mem / 1024 / 1024);
        required_mb = static_cast<unsigned long long>(required_mem / 1024 / 1024);
        printf("ERROR: Device %d cannot store 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf(" CL_DEVICE_GLOBAL_MEM_SIZE is %llu MB, this configuration requires %llu MB\n", available_mb, required_mb);
        printf(" Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // All clear, show the running parameters!
    required_mb = static_cast<unsigned long long>(required_mem / 1024 / 1024);
    printf("Using %d work group size\n", wgs);
    printf("Using vector size %d\n", vect_type);
    printf("Using 2^%d buckets\n", buckets_log2);
    printf("Using %d elements per bucket\n", bucket_size);
    printf("Using %llu MB of memory\n", required_mb);
    printf("Estimated drop percentage: %5.2f%%\n", 100 * poisson_estimate((1 << buckets_log2), MAX_MOMENTUM_NONCE, bucket_size));
    printf("\n");

    // Compile the OpenCL code
    printf("Compiling OpenCL code... this may take 3-5 minutes\n");
    bool isGPU = device->isGPU();
    if (!isGPU) { gpu_watchdog_max_wait *= 6; } // Effectively disable the watchdog

    std::stringstream params;
    params << " -I ./opencl/";
    params << " -D DEVICE_GPU=" << (isGPU ? 1 : 0);
    params << " -D VECT_TYPE=" << vect_type;
    params << " -D LOCAL_WGS=" << wgs;
    params << " -D NUM_BUCKETS_LOG2=" << buckets_log2;
    params << " -D BUCKET_SIZE=" << bucket_size;

#ifdef USE_SOURCE
    std::vector<std::string> file_list;
    file_list.push_back("opencl/momentum.cl");
    OpenCLProgram* program = device->getContext()->loadProgramFromFiles(file_list, params.str());
#else
    std::vector<std::string> input_src;
    input_src.push_back(getMomentumOpenCL());
    OpenCLProgram* program = device->getContext()->loadProgramFromStrings(input_src, params.str());
#endif

    kernel_hash = program->getKernel("hash_step");
    kernel_reset = program->getKernel("reset_and_seek");

    mid_hash = device->getContext()->createBuffer(32 * sizeof(cl_uint), CL_MEM_READ_ONLY, NULL);
    hash_list = device->getContext()->createBuffer(calc_hash_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);
    index_list = device->getContext()->createBuffer(calc_index_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);

    nonce_a = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_b = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_qty = device->getContext()->createBuffer(sizeof(cl_uint), CL_MEM_READ_WRITE, NULL);

    q = device->getContext()->createCommandQueue(device);
}
/* Tear-down: stop the task pool, then release the path trace program. */
~OpenCLDeviceMegaKernel()
{
	task_pool.stop();

	/* Release kernels */
	path_trace_program.release();
}
~OpenCLDeviceSplitKernel() { task_pool.stop(); /* Release kernels */ program_data_init.release(); program_scene_intersect.release(); program_lamp_emission.release(); program_queue_enqueue.release(); program_background_buffer_update.release(); program_shader_eval.release(); program_holdout_emission_blurring_pathtermination_ao.release(); program_direct_lighting.release(); program_shadow_blocked.release(); program_next_iteration_setup.release(); program_sum_all_radiance.release(); /* Release global memory */ release_mem_object_safe(rng_coop); release_mem_object_safe(throughput_coop); release_mem_object_safe(L_transparent_coop); release_mem_object_safe(PathRadiance_coop); release_mem_object_safe(Ray_coop); release_mem_object_safe(PathState_coop); release_mem_object_safe(Intersection_coop); release_mem_object_safe(kgbuffer); release_mem_object_safe(sd); release_mem_object_safe(sd_DL_shadow); release_mem_object_safe(ray_state); release_mem_object_safe(AOAlpha_coop); release_mem_object_safe(AOBSDF_coop); release_mem_object_safe(AOLightRay_coop); release_mem_object_safe(BSDFEval_coop); release_mem_object_safe(ISLamp_coop); release_mem_object_safe(LightRay_coop); release_mem_object_safe(Intersection_coop_shadow); #ifdef WITH_CYCLES_DEBUG release_mem_object_safe(debugdata_coop); #endif release_mem_object_safe(use_queues_flag); release_mem_object_safe(Queue_data); release_mem_object_safe(Queue_index); release_mem_object_safe(work_array); #ifdef __WORK_STEALING__ release_mem_object_safe(work_pool_wgs); #endif release_mem_object_safe(per_sample_output_buffers); if(hostRayStateArray != NULL) { free(hostRayStateArray); } }