static void find_best_gws(int jtrUniqDevNo, struct fmt_main *fmt) { long int gds_size, device_gds_size; static long double total_exec_time_inv; device_gds_size = (long int)globalObj[jtrUniqDevNo].exec_time_inv * 163840; if (device_gds_size * sizeof(temp_buf) > get_max_mem_alloc_size(jtrUniqDevNo)) { device_gds_size = ((get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf)) / 8192) * 8192; gds_size = (long int)(total_exec_time_inv * 163840) + device_gds_size; gds_size = (gds_size / 8192 - 1 ) * 8192; total_exec_time_inv += globalObj[jtrUniqDevNo].exec_time_inv; } else { total_exec_time_inv += globalObj[jtrUniqDevNo].exec_time_inv; gds_size = (long int)(total_exec_time_inv * 163840); gds_size = (gds_size / 8192 + 1 ) * 8192; } gds_size = (gds_size < (MAX_KEYS_PER_CRYPT - 8192)) ? gds_size : (MAX_KEYS_PER_CRYPT - 8192); gds_size = (gds_size > 8192) ? gds_size : 8192; if (options.verbosity > 2) fprintf(stderr, "Optimal Global Work Size:%ld\n", gds_size); fmt -> params.max_keys_per_crypt = gds_size; fmt -> params.min_keys_per_crypt = max_lws(); }
static void find_best_gws(int do_benchmark, struct fmt_main *self) { int num; cl_ulong run_time, min_time = CL_ULONG_MAX; unsigned int SHAspeed, bestSHAspeed = 0, max_gws; int optimal_gws = local_work_size; const int sha1perkey = 50004; unsigned long long int MaxRunTime = 5000000000ULL; max_gws = get_max_mem_alloc_size(ocl_gpu_id) / (UNICODE_LENGTH * VF); if (do_benchmark) { fprintf(stderr, "Calculating best keys per crypt (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, MaxRunTime / 1000000000UL); fprintf(stderr, "Raw GPU speed figures including buffer transfers:\n"); } for (num = local_work_size; max_gws; num *= 2) { if (!do_benchmark) advance_cursor(); if (!(run_time = gws_test(num, do_benchmark, self))) break; SHAspeed = sha1perkey * (1000000000UL * VF * num / run_time); if (run_time < min_time) min_time = run_time; if (do_benchmark) fprintf(stderr, "gws %6d%8llu c/s%14u sha1/s%8.3f sec per crypt_all()", num, (1000000000ULL * VF * num / run_time), SHAspeed, (float)run_time / 1000000000.); if (((float)run_time / (float)min_time) < ((float)SHAspeed / (float)bestSHAspeed)) { if (do_benchmark) fprintf(stderr, "!\n"); bestSHAspeed = SHAspeed; optimal_gws = num; } else { if (run_time < MaxRunTime && SHAspeed > (bestSHAspeed * 1.01)) { if (do_benchmark) fprintf(stderr, "+\n"); bestSHAspeed = SHAspeed; optimal_gws = num; continue; } if (do_benchmark) fprintf(stderr, "\n"); if (run_time >= MaxRunTime) break; } } global_work_size = optimal_gws; }
/* -- This function could be used to calculated the best num of keys per crypt for the given format -- */ static void find_best_gws(struct fmt_main * self) { size_t num = 0; cl_ulong run_time, min_time = CL_ULONG_MAX; int optimal_gws = local_work_size, step = STEP; int do_benchmark = 0; unsigned int SHAspeed, bestSHAspeed = 0; unsigned long long int max_run_time = 1000000000ULL; char *tmp_value; if ((tmp_value = getenv("STEP"))){ step = atoi(tmp_value); do_benchmark = 1; } step = get_multiple(step, local_work_size); if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, DUR_CONFIG))) max_run_time = atoi(tmp_value) * 1000000000UL; fprintf(stderr, "Calculating best global work size (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, max_run_time / 1000000000ULL); if (do_benchmark) fprintf(stderr, "Raw speed figures including buffer transfers:\n"); for (num = get_step(num, step, 1); num; num = get_step(num, step, 0)) { //Check if hardware can handle the size we are going to try now. if (sizeof(sha256_password) * num * 1.2 > get_max_mem_alloc_size(ocl_gpu_id)) break; if (! (run_time = gws_test(num, self))) continue; if (!do_benchmark) advance_cursor(); SHAspeed = num / (run_time / 1000000000.); if (run_time < min_time) min_time = run_time; if (do_benchmark) { fprintf(stderr, "gws: %8zu\t%12lu c/s %8.3f ms per crypt_all()", num, (long) (num / (run_time / 1000000000.)), (float) run_time / 1000000.); if (run_time > max_run_time) { fprintf(stderr, " - too slow\n"); break; } } else { if (run_time > min_time * 20 || run_time > max_run_time) break; } if (((long) SHAspeed - bestSHAspeed) > 10000) { if (do_benchmark) fprintf(stderr, "+"); bestSHAspeed = SHAspeed; optimal_gws = num; } if (do_benchmark) fprintf(stderr, "\n"); } fprintf(stderr, "Optimal global work size %d\n", optimal_gws); fprintf(stderr, "(to avoid this test on next run, put \"" GWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS SUBSECTION_OPENCL "])\n", optimal_gws); global_work_size = optimal_gws; create_clobj(optimal_gws, self); }
static void find_best_workgroup(int jtrUniqDevNo, unsigned int gpu_perf) { size_t _lws=0; cl_device_type dTyp; cl_command_queue cmdq; cl_int err; unsigned int max_kpc = get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf) < MAX_KEYS_PER_CRYPT ? ((get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf)) / 8192 - 1) * 8192 : MAX_KEYS_PER_CRYPT; cl_uint *dcc_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536)); cl_uint *dcc2_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536)); cl_uint *hmac_sha1_out = (cl_uint*)mem_alloc(5 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536)); cl_uint salt_api[9], length = 10; event_ctr = 0; //HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info"); dTyp = get_device_type(jtrUniqDevNo); if (dTyp == CL_DEVICE_TYPE_CPU) globalObj[jtrUniqDevNo].lws = 1; else globalObj[jtrUniqDevNo].lws = 16; ///Set Dummy DCC hash , unicode salt and ascii salt(username) length memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536)); memset(salt_api, 0xfe, 9 * sizeof(cl_uint)); cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err); HANDLE_CLERROR(err, "Error creating command queue"); PROFILE = 1; kernelExecTimeNs = CL_ULONG_MAX; ///Find best local work size while (1) { _lws = globalObj[jtrUniqDevNo].lws; if (dTyp == CL_DEVICE_TYPE_CPU) exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq, hmac_sha1_out); else exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, (((max_kpc < 65536) ? max_kpc : 65536) / gpu_perf), jtrUniqDevNo, cmdq, hmac_sha1_out); if (globalObj[jtrUniqDevNo].lws <= _lws) break; } if (dTyp == CL_DEVICE_TYPE_CPU) globalObj[jtrUniqDevNo].exec_time_inv = globalObj[jtrUniqDevNo].exec_time_inv / 16; else globalObj[jtrUniqDevNo].exec_time_inv *= (((max_kpc < 65536) ? max_kpc : 65536) / (long double) gpu_perf) / 65536; PROFILE = 0; if (options.verbosity > 2) { fprintf(stderr, "Optimal Work Group Size:%d\n", (int)globalObj[jtrUniqDevNo].lws); fprintf(stderr, "Kernel Execution Speed (Higher is better):%Lf\n", globalObj[jtrUniqDevNo].exec_time_inv); } MEM_FREE(dcc_hash_host); MEM_FREE(dcc2_hash_host); MEM_FREE(hmac_sha1_out); HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed"); }
size_t select_device(int jtrUniqDevNo, struct fmt_main *fmt) { cl_int err; const char *errMsg; size_t memAllocSz; active_dev_ctr++; opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL); globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_short", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_preprocess_short FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_long", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_preprocess_long FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n"); return 0; } globalObj[jtrUniqDevNo].krnl[3] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err); if (err) { fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n"); return 0; } errMsg = "Create Buffer FAILED"; memAllocSz = 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0) HANDLE_CLERROR(err,errMsg ); globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); memAllocSz = MAX_KEYS_PER_CRYPT * sizeof(temp_buf); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); memAllocSz = 5 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint); memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4; globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err); if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0) HANDLE_CLERROR(err, errMsg); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 3 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 1 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 1 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 2, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu), "Set Kernel 1 Arg 2 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 3 Arg 0 :FAILED"); HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 3 Arg 1 :FAILED"); if (!local_work_size) find_best_workgroup(jtrUniqDevNo, quick_bechmark(jtrUniqDevNo)); else { size_t maxsize, maxsize2; globalObj[jtrUniqDevNo].lws = local_work_size; // Obey limits HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[0], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Error querying max LWS"); HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[1], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[2], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[3], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS"); if (maxsize2 > maxsize) maxsize = maxsize2; while (globalObj[jtrUniqDevNo].lws > maxsize) globalObj[jtrUniqDevNo].lws /= 2; if (options.verbosity > 3) fprintf(stderr, "Local worksize (LWS) forced to "Zu"\n", globalObj[jtrUniqDevNo].lws); globalObj[jtrUniqDevNo].exec_time_inv = 1; } if (!global_work_size) find_best_gws(jtrUniqDevNo, fmt); else { if (options.verbosity > 3) fprintf(stderr, "Global worksize (GWS) forced to "Zu"\n", global_work_size); fmt -> params.max_keys_per_crypt = global_work_size; fmt -> params.min_keys_per_crypt = max_lws(); } return globalObj[jtrUniqDevNo].lws; }