/**
 * Builds the V3 momentum solver for one OpenCL device.
 *
 * Loads the three program sources under "opencl/", creates the command queue
 * and the device buffers, and binds the arguments of "kernel_sha512" and
 * "kernel_clean_hash_table" once, up front.
 *
 * @param _HASH_BITS  log2 of the number of 32-bit slots in the device hash table.
 * @param _device_num index of the OpenCL device to run on.
 */
OpenCLMomentumV3::OpenCLMomentumV3(int _HASH_BITS, int _device_num) {
    max_threads = 1<<30; // very big
    HASH_BITS = _HASH_BITS;
    device_num = _device_num;

    OpenCLMain& main = OpenCLMain::getInstance();

    // checks if device exists (a negative index is never valid either)
    if (device_num < 0 || main.getNumDevices() <= device_num) {
        printf("ERROR: DEVICE %d does not exist. Please limit your threads to one per device.\n", device_num);
        // NOTE(review): assert() is compiled out under NDEBUG, in which case
        // construction continues with the invalid index.
        assert(false);
    }

    // compiles
    fprintf(stdout, "Starting OpenCLMomentum V3\n");
    fprintf(stdout, "Device %02d: %s\n", device_num, main.getDevice(device_num)->getName().c_str());
    cl_ulong maxWorkGroupSize = main.getDevice(device_num)->getMaxWorkGroupSize();
    // cl_ulong is not guaranteed to be `unsigned long long`, so cast to match %llu.
    fprintf(stdout, "Max work group size: %llu\n", static_cast<unsigned long long>(maxWorkGroupSize));

    // Never use more threads than the device's maximum work-group size.
    if (maxWorkGroupSize < max_threads) max_threads = maxWorkGroupSize;

    OpenCLContext *context = main.getDevice(device_num)->getContext();
    std::vector<std::string> program_filenames;
    program_filenames.push_back("opencl/opencl_cryptsha512.h");
    program_filenames.push_back("opencl/cryptsha512_kernel.cl");
    program_filenames.push_back("opencl/OpenCLMomentumV3.cl");
    OpenCLProgram *program = context->loadProgramFromFiles(program_filenames);

    // preallocate kernels
    OpenCLKernel *kernel = program->getKernel("kernel_sha512");
    OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table");

    // only one queue, helps with memory leaking
    queue = context->createCommandQueue(main.getDevice(device_num));

    // allocate internal structure; the slot count is computed in size_t so the
    // shift is not evaluated (and possibly overflowed) in int first
    const size_t ht_slots = static_cast<size_t>(1) << HASH_BITS;
    cl_message = context->createBuffer(sizeof(uint8_t)*32, CL_MEM_READ_ONLY, NULL);
    internal_hash_table = context->createBuffer(sizeof(uint32_t)*ht_slots, CL_MEM_READ_WRITE, NULL);
    temp_collisions = context->createBuffer(sizeof(collision_struct)*getCollisionCeiling(), CL_MEM_WRITE_ONLY, NULL);
    temp_collisions_count = context->createBuffer(sizeof(size_t), CL_MEM_READ_WRITE, NULL);

    // sets args: the cleanup kernel only receives the hash table
    kernel_cleanup->resetArgs();
    kernel_cleanup->addGlobalArg(internal_hash_table);

    // sets args: message, hash table, table size, collision output, collision counter
    kernel->resetArgs();
    kernel->addGlobalArg(cl_message);
    kernel->addGlobalArg(internal_hash_table);
    uint32_t ht_size = static_cast<uint32_t>(1) << HASH_BITS;
    kernel->addScalarUInt(ht_size);
    kernel->addGlobalArg(temp_collisions);
    kernel->addGlobalArg(temp_collisions_count);
}
void OpenCLMomentumV9::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
void OpenCLMomentumV3::find_collisions(uint8_t* message, collision_struct* collisions, size_t* collision_count) { // temp storage *collision_count = 0; OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel = program->getKernel("kernel_sha512"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); assert(kernel != NULL); //size_t BLOCKSIZE = main.getPlatform(0)->getDevice(0)->getMaxWorkGroupSize(); size_t BLOCKSIZE = kernel->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); //has to be a power of 2 BLOCKSIZE = 1<<log2(BLOCKSIZE); size_t BLOCKSIZE_CLEAN = kernel_cleanup->getWorkGroupSize(OpenCLMain::getInstance().getDevice(device_num)); BLOCKSIZE_CLEAN = 1<<log2(BLOCKSIZE_CLEAN); // printf("BLOCKSIZE = %ld\n", BLOCKSIZE); // printf("BLOCKSIZE_CLEAN = %ld\n", BLOCKSIZE_CLEAN); // cleans up the hash table queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, BLOCKSIZE_CLEAN); queue->enqueueWriteBuffer(cl_message, message, sizeof(uint8_t)*32); queue->enqueueWriteBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueKernel1D(kernel, MAX_MOMENTUM_NONCE/8, BLOCKSIZE); queue->enqueueReadBuffer(temp_collisions_count, collision_count, sizeof(size_t)); queue->enqueueReadBuffer(temp_collisions, collisions, sizeof(collision_struct)*getCollisionCeiling()); queue->finish(); }
void OpenCLMomentumV8::find_collisions(uint8_t* message, collision_struct* out_buff, size_t* out_count) { // temp storage *out_count = 0; uint32_t ht_size = 1<<HASH_BITS; SHA512_Context c512_avxsse; SHA512_Init(&c512_avxsse); uint8_t midhash[32+4]; memcpy(midhash+4, message, 32); *((uint32_t*)midhash) = 0; SHA512_Update_Simple(&c512_avxsse, midhash, 32+4); SHA512_PreFinal(&c512_avxsse); *(uint32_t *)(&c512_avxsse.buffer.bytes[0]) = 0; uint64_t * swap_helper = (uint64_t*)(&c512_avxsse.buffer.bytes[0]); for (int i = 1; i < 5; i++) { swap_helper[i] = SWAP64(swap_helper[i]); } OpenCLContext *context = OpenCLMain::getInstance().getDevice(device_num)->getContext(); OpenCLProgram *program = context->getProgram(0); OpenCLKernel *kernel_calculate_all_hashes = program->getKernel("calculate_all_hashes"); OpenCLKernel *kernel_fill_table = program->getKernel("fill_table"); OpenCLKernel *kernel_find_collisions = program->getKernel("find_collisions"); OpenCLKernel *kernel_cleanup = program->getKernel("kernel_clean_hash_table"); OpenCLDevice * device = OpenCLMain::getInstance().getDevice(device_num); // cleans up the hash table size_t kc_wgsize = kernel_cleanup->getWorkGroupSize(device); kc_wgsize = 1<<log2(kc_wgsize); queue->enqueueKernel1D(kernel_cleanup, 1<<HASH_BITS, kc_wgsize); // printf("Cleaning the HT\n"); // queue->finish(); queue->enqueueWriteBuffer(cl_message, c512_avxsse.buffer.bytes, sizeof(uint8_t)*SHA512_BLOCK_SIZE); // step 1, calculate hashes size_t kcah_wgsize = kernel_calculate_all_hashes->getWorkGroupSize(device); kcah_wgsize = 1<<log2(kcah_wgsize); queue->enqueueKernel1D(kernel_calculate_all_hashes, MAX_MOMENTUM_NONCE/8, kcah_wgsize); // uint64_t * apa = new uint64_t[MAX_MOMENTUM_NONCE]; // queue->enqueueReadBuffer(hashes, apa, sizeof(uint64_t)*MAX_MOMENTUM_NONCE); // queue->finish(); // // printf("testing hashes\n"); // uint64_t count = 0; // for (int i = 0; i < MAX_MOMENTUM_NONCE; i++) { // if (apa[i] == 0) { // count++; // printf("BAD HASH AT: %d %X\n", i, 
apa[i]); // } // } // printf("counted %X bad hashes\n", count); // printf("NOW REALLY TEST THEM hashes\n"); // count = 0; // for (uint32_t i = 0; i < MAX_MOMENTUM_NONCE/8; i+=8) { // sph_sha512_context c512_sph; //SPH // sph_sha512_init(&c512_sph); // sph_sha512(&c512_sph, &i, 4); // sph_sha512(&c512_sph, message, 32); // uint64_t out[8]; // sph_sha512_close(&c512_sph, out); // for (int j =0; j < 8; j++) { // if (apa[i+j] != out[j]) { // count++; // uint64_t xxx = apa[i+j]; // printf("BAD HASH AT: %d => %X != %X\n", i, apa[i+j], out[j]); // } // } // } // printf("counted %X bad hashes\n", count); // step 2, populate hashtable size_t kft_wgsize = kernel_fill_table->getWorkGroupSize(device); kft_wgsize = 1<<log2(kft_wgsize); queue->enqueueKernel1D(kernel_fill_table, MAX_MOMENTUM_NONCE, kft_wgsize); // printf("step 2, populate hashtable\n"); // queue->finish(); queue->enqueueWriteBuffer(collisions_count, out_count, sizeof(size_t)); // step 3, find collisions size_t kfc_wgsize = kernel_find_collisions->getWorkGroupSize(device); kfc_wgsize = 1<<log2(kfc_wgsize); queue->enqueueKernel1D(kernel_find_collisions, MAX_MOMENTUM_NONCE, kfc_wgsize); // printf("step 3, find collisions\n"); // queue->finish(); queue->enqueueReadBuffer(collisions_count, out_count, sizeof(size_t)); queue->enqueueReadBuffer(collisions, out_buff, sizeof(collision_struct)*getCollisionCeiling()); // printf("step 4, copy output\n"); queue->finish(); #ifdef DEBUG printf("Collision Count = %d\n", (*out_count)); #endif }
/**
 * Sets up the protoshare solver on one OpenCL device.
 *
 * Reads the tuning parameters from commandlineInput, derives a bucket size
 * from the memory target when one applies, validates the configuration
 * against the device's allocation and global memory limits, then builds the
 * program and creates the kernels, buffers and command queue.
 *
 * On an unusable configuration it prints an error and terminates the process
 * via exit().
 *
 * @param _device_num index of the OpenCL device to use.
 */
ProtoshareOpenCL::ProtoshareOpenCL(int _device_num) {
    this->device_num = _device_num;

    printf("Initializing GPU %d\n", device_num);
    OpenCLMain &main = OpenCLMain::getInstance();
    this->device = main.getDevice(device_num);

    printf("======================================================================\n");
    printf("Device information for: %s\n", device->getName().c_str());
    device->dumpDeviceInfo(); // Makes troubleshooting easier
    printf("======================================================================\n");
    printf("\n");

    // Sanitize input parameters: a work group size of 0 means "use the device maximum"
    if (commandlineInput.wgs == 0) {
        this->wgs = device->getMaxWorkGroupSize();
    } else {
        this->wgs = commandlineInput.wgs;
    }
    this->buckets_log2 = commandlineInput.buckets_log2;
    this->vect_type = commandlineInput.vect_type;
    this->bucket_size = commandlineInput.bucket_size;
    this->target_mem = commandlineInput.target_mem;

    // If bucket size unset and target memory unset, use maximum usable memory
    if (target_mem == 0 && bucket_size == 0) {
        target_mem = device->getGlobalMemSize() / 1024 / 1024;
    }

    // If set, convert target memory into a usable value for bucket_size
    if (target_mem > 0) {
        // Convert the target from MB to bytes in 64 bits: a 32-bit product
        // wraps around for targets of 4096 MB and above.
        const cl_ulong target_mem_bytes = static_cast<cl_ulong>(target_mem) * 1024 * 1024;

        // Lazy calculation, assume large bucket_size, scale back from there
        // until the total usage no longer exceeds the target
        bucket_size = 1024;
        while (bucket_size > 0 && calc_total_mem_usage(buckets_log2, bucket_size) > target_mem_bytes) {
            bucket_size--;
        }

        // Make sure the parameter configuration is sane:
        if (bucket_size < 1) {
            printf("ERROR: Memory target of %d MB cannot be attained with 2^%d buckets!\n", target_mem, buckets_log2);
            printf(" Please lower the value of \"-b\" or increase the value of \"-m\".\n");
            // NOTE(review): exit status 0 on an error path is kept unchanged here;
            // confirm whether launchers rely on it before switching to a failure code.
            exit(0);
        }
    }

    // Make sure we can allocate hash_list (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    cl_ulong required_mem = calc_hash_mem_usage(buckets_log2, bucket_size);
    cl_ulong available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        // cl_ulong values are cast explicitly so they match %llu
        printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf(" Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // Make sure we can allocate nonce_map (cannot violate CL_DEVICE_MAX_MEM_ALLOC_SIZE)
    required_mem = calc_index_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getMaxMemAllocSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot allocate index of 2^%d elements!\n", device_num, buckets_log2);
        printf(" CL_DEVICE_MAX_MEM_ALLOC_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf(" Please lower the value of \"-b\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // Make sure the whole thing fits in memory
    required_mem = calc_total_mem_usage(buckets_log2, bucket_size);
    available_mem = device->getGlobalMemSize();
    if (required_mem > available_mem) {
        printf("ERROR: Device %d cannot store 2^%d buckets of %d elements!\n", device_num, buckets_log2, bucket_size);
        printf(" CL_DEVICE_GLOBAL_MEM_SIZE is %llu MB, this configuration requires %llu MB\n",
               static_cast<unsigned long long>(available_mem / 1024 / 1024),
               static_cast<unsigned long long>(required_mem / 1024 / 1024));
        printf(" Please lower the value of \"-b\" or \"-s\" or increase the value of \"-m\".\n");
        exit(0);
    }

    // All clear, show the running parameters!
    printf("Using %d work group size\n", wgs);
    printf("Using vector size %d\n", vect_type);
    printf("Using 2^%d buckets\n", buckets_log2);
    printf("Using %d elements per bucket\n", bucket_size);
    printf("Using %llu MB of memory\n", static_cast<unsigned long long>(required_mem / 1024 / 1024));
    printf("Estimated drop percentage: %5.2f%%\n", 100 * poisson_estimate((1 << buckets_log2), MAX_MOMENTUM_NONCE, bucket_size));
    printf("\n");

    // Compile the OpenCL code
    printf("Compiling OpenCL code... this may take 3-5 minutes\n");
    bool isGPU = device->isGPU();
    if (!isGPU) { gpu_watchdog_max_wait *= 6; } // Effectively disable the watchdog

    // Build options: include path plus the tuning parameters as preprocessor defines
    std::stringstream params;
    params << " -I ./opencl/";
    params << " -D DEVICE_GPU=" << (isGPU ? 1 : 0);
    params << " -D VECT_TYPE=" << vect_type;
    params << " -D LOCAL_WGS=" << wgs;
    params << " -D NUM_BUCKETS_LOG2=" << buckets_log2;
    params << " -D BUCKET_SIZE=" << bucket_size;

#ifdef USE_SOURCE
    // Build from the .cl file on disk
    std::vector<std::string> file_list;
    file_list.push_back("opencl/momentum.cl");
    OpenCLProgram* program = device->getContext()->loadProgramFromFiles(file_list, params.str());
#else
    // Build from the source string returned by getMomentumOpenCL()
    std::vector<std::string> input_src;
    input_src.push_back(getMomentumOpenCL());
    OpenCLProgram* program = device->getContext()->loadProgramFromStrings(input_src, params.str());
#endif

    kernel_hash = program->getKernel("hash_step");
    kernel_reset = program->getKernel("reset_and_seek");

    // Device buffers and the command queue used by this solver
    mid_hash = device->getContext()->createBuffer(32 * sizeof(cl_uint), CL_MEM_READ_ONLY, NULL);
    hash_list = device->getContext()->createBuffer(calc_hash_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);
    index_list = device->getContext()->createBuffer(calc_index_mem_usage(buckets_log2, bucket_size), CL_MEM_READ_WRITE, NULL);

    nonce_a = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_b = device->getContext()->createBuffer(256 * sizeof(cl_uint), CL_MEM_WRITE_ONLY, NULL);
    nonce_qty = device->getContext()->createBuffer(sizeof(cl_uint), CL_MEM_READ_WRITE, NULL);

    q = device->getContext()->createCommandQueue(device);
}