static void init(struct fmt_main *self) { char *kpc; global_work_size = MAX_KEYS_PER_CRYPT; opencl_init("$JOHN/md4_kernel.cl", ocl_gpu_id, platform_id); crypt_kernel = clCreateKernel(program[ocl_gpu_id], "md4", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); if( ((kpc = getenv("LWS")) == NULL) || (atoi(kpc) == 0)) { create_clobj(MD4_NUM_KEYS); opencl_find_best_workgroup(self); release_clobj(); }else { local_work_size = atoi(kpc); } if( (kpc = getenv("GWS")) == NULL){ max_keys_per_crypt = MD4_NUM_KEYS; create_clobj(MD4_NUM_KEYS); } else { if (atoi(kpc) == 0){ //user chose to die of boredom max_keys_per_crypt = MD4_NUM_KEYS; create_clobj(MD4_NUM_KEYS); find_best_kpc(); } else { max_keys_per_crypt = atoi(kpc); create_clobj(max_keys_per_crypt); } } fprintf(stderr, "Local work size (LWS) %d, Global work size (GWS) %d\n",(int)local_work_size, max_keys_per_crypt); self->params.max_keys_per_crypt = max_keys_per_crypt; }
/*
 * Format init for the pwsafe OpenCL format.
 *
 * Builds the three kernels (init/run/finish), then drives the shared
 * auto-tune library to pick LWS and GWS.  crypt_all is temporarily
 * pointed at the benchmark variant while tuning runs and restored to
 * the real implementation afterwards.
 */
static void init(struct fmt_main *self) {
	cl_ulong maxsize;
	size_t selected_gws;
	opencl_init_opt("$JOHN/kernels/pwsafe_kernel.cl", ocl_gpu_id, NULL);
	init_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_INIT_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating init kernel");
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_RUN_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating crypt kernel");
	finish_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_FINISH_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating finish kernel");
	/* Defaults before user preferences: LWS 1 on CPU, 64 otherwise;
	   GWS 0 means "auto-tune". */
	local_work_size = cpu(device_info[ocl_gpu_id]) ? 1 : 64;
	global_work_size = 0;
	opencl_get_user_preferences(CONFIG_NAME);
	//Initialize openCL tuning (library) for this format.
	opencl_init_auto_setup(STEP, ROUNDS_DEFAULT/8, 8, split_events, warn, &multi_profilingEvent[3], self, create_clobj, release_clobj, sizeof(pwsafe_pass), 0);
	/* Tune with the benchmark crypt_all; remember the user-requested GWS
	   so it can be restored after the LWS search below. */
	self->methods.crypt_all = crypt_all_benchmark;
	selected_gws = global_work_size;
	/* Note: we ask for the kernels' max sizes, not the device's! */
	maxsize = get_current_work_group_size(ocl_gpu_id, init_kernel);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, crypt_kernel), maxsize);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, finish_kernel), maxsize);
	/* Halve LWS until all three kernels accept it. */
	while (local_work_size > maxsize)
		local_work_size >>= 1;
	self->params.max_keys_per_crypt = (global_work_size ? global_work_size: MAX_KEYS_PER_CRYPT);
	if (!local_work_size) {
		/* LWS 0 requested: benchmark for the best work-group size. */
		create_clobj(self->params.max_keys_per_crypt, self);
		find_best_lws(self, ocl_gpu_id);
		release_clobj();
	}
	global_work_size = selected_gws;
	if (global_work_size)
		create_clobj(global_work_size, self);
	else
		//user chose to die of boredom
		find_best_gws(self, ocl_gpu_id);
	self->params.min_keys_per_crypt = local_work_size;
	self->params.max_keys_per_crypt = global_work_size;
	/* Tuning done: restore the real crypt_all. */
	self->methods.crypt_all = crypt_all;
	if (options.verbosity > 2)
		fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
}
static void fmt_ssha_init(struct fmt_main *pFmt) { char *kpc; opencl_init("$JOHN/ssha_opencl_kernel.cl", gpu_id); // create kernel to execute crypt_kernel = clCreateKernel(program[gpu_id], "sha1_crypt_kernel", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); if( (kpc = getenv("LWS")) == NULL){ create_clobj(SSHA_NUM_KEYS); find_best_workgroup(); release_clobj(); }else { local_work_size = atoi(kpc); } if( (kpc = getenv("KPC")) == NULL){ max_keys_per_crypt = SSHA_NUM_KEYS; create_clobj(SSHA_NUM_KEYS); } else { if (atoi(kpc) == 0){ //user chose to die of boredom max_keys_per_crypt = SSHA_NUM_KEYS; create_clobj(SSHA_NUM_KEYS); find_best_kpc(); } else { max_keys_per_crypt = atoi(kpc); create_clobj(max_keys_per_crypt); } } printf("Local work size (LWS) %d, Keys per crypt (KPC) %d\n",(int)local_work_size,max_keys_per_crypt); pFmt->params.max_keys_per_crypt = max_keys_per_crypt; }
static void fmt_ssha_init(struct fmt_main *self) { char *temp; cl_ulong maxsize; global_work_size = 0; opencl_init("$JOHN/ssha_kernel.cl", ocl_gpu_id, platform_id); // create kernel to execute crypt_kernel = clCreateKernel(program[ocl_gpu_id], "sha1_crypt_kernel", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size"); if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG))) local_work_size = atoi(temp); if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG))) global_work_size = atoi(temp); if ((temp = getenv("LWS"))) local_work_size = atoi(temp); if ((temp = getenv("GWS"))) global_work_size = atoi(temp); if (!local_work_size) { int temp = global_work_size; local_work_size = maxsize; global_work_size = global_work_size ? global_work_size : 4 * maxsize; create_clobj(global_work_size, self); opencl_find_best_workgroup_limit(self, maxsize); release_clobj(); global_work_size = temp; } if (local_work_size > maxsize) { fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize); local_work_size = maxsize; } if (!global_work_size) find_best_gws(getenv("GWS") == NULL ? 0 : 1, self); if (global_work_size < local_work_size) global_work_size = local_work_size; fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size); create_clobj(global_work_size, self); atexit(release_clobj); }
/*
 * Search for the best keys-per-crypt for the salted SHA-1 format by
 * profiling the crypt kernel plus the result read-back at decreasing
 * candidate sizes, keeping the size with the lowest scaled per-key time.
 * Leaves the buffers allocated at the winning size on return.
 */
static void find_best_kpc(void){
	int num;
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	/* Best scaled time-per-key so far; a candidate wins only if it scores
	   <= this.  NOTE(review): 6969 looks like an arbitrary seed value. */
	int kernelExecTimeNs = 6969;
	cl_int ret_code;
	int optimal_kpc=2048;
	int i = 0;
	cl_uint *tmpbuffer;
	printf("Calculating best keys per crypt, this will take a while ");
	for( num=SSHA_NUM_KEYS; num > 4096 ; num -= 16384){
		release_clobj();
		create_clobj(num);
		advance_cursor();
		/* A dedicated profiling queue per candidate. */
		queue_prof = clCreateCommandQueue( context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		/* Fill the key buffer with a fixed dummy plaintext.
		   NOTE(review): copies PLAINTEXT_LENGTH bytes from the 9-byte
		   literal "abacaeaf" - verify PLAINTEXT_LENGTH <= 9, otherwise
		   this reads past the end of the literal. */
		for (i=0; i < num; i++){
			memcpy(&(saved_plain[i*PLAINTEXT_LENGTH]),"abacaeaf",PLAINTEXT_LENGTH);
		}
		clEnqueueWriteBuffer(queue_prof, mysalt, CL_TRUE, 0, SALT_SIZE, saved_salt, 0, NULL, NULL);
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH) * num, saved_plain, 0, NULL, NULL);
		ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if(ret_code != CL_SUCCESS){
			/* NOTE(review): queue_prof is not released on this path, so
			   it leaks once per failed candidate. */
			printf("Error %d\n",ret_code);
			continue;
		}
		clFinish(queue_prof);
		/* Kernel time: submit-to-end of the crypt kernel. */
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = malloc(sizeof(cl_uint) * num);
		/* Add the result read-back time (myEvent is reused here;
		   NOTE(review): event objects are never released). */
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		/* Keep the candidate with the lowest scaled time per key. */
		if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ;
			optimal_kpc = num;
		}
		free(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
	}
	printf("Optimal keys per crypt %d\n(to avoid this test on next run, put \"" KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS SUBSECTION_OPENCL "])\n", optimal_kpc, optimal_kpc);
	max_keys_per_crypt = optimal_kpc;
	release_clobj();
	create_clobj(optimal_kpc);
}
/*
 * Search for the best keys-per-crypt for the MD4 format: profile the
 * crypt kernel plus result read-back at decreasing candidate sizes and
 * keep the size with the lowest scaled per-key time.  Buffers are left
 * allocated at the winning size.
 */
static void find_best_kpc(void){
	int num;
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	/* Best scaled time-per-key so far (arbitrary seed value; the first
	   candidate only wins if it scores <= this). */
	int kernelExecTimeNs = 6969;
	cl_int ret_code;
	int optimal_kpc=2048;
	int i = 0;
	cl_uint *tmpbuffer;
	fprintf(stderr, "Calculating best keys per crypt, this will take a while ");
	for( num=MD4_NUM_KEYS; num > 4096 ; num -= 4096){
		release_clobj();
		create_clobj(num);
		advance_cursor();
		queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		/* Fill the key buffer with a fixed plaintext; the 0x80 written at
		   offset 8 is the MD4 padding marker after the 8-char key. */
		for (i=0; i < num; i++){
			memcpy(&(saved_plain[i * (PLAINTEXT_LENGTH + 1)]), "abcaaeaf", PLAINTEXT_LENGTH + 1);
			saved_plain[i * (PLAINTEXT_LENGTH + 1) + 8] = 0x80;
		}
		clEnqueueWriteBuffer(queue_prof, data_info, CL_TRUE, 0, sizeof(unsigned int)*2, datai, 0, NULL, NULL);
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH + 1) * num, saved_plain, 0, NULL, NULL);
		ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if(ret_code != CL_SUCCESS) {
			/* NOTE(review): HANDLE_CLERROR normally aborts on a failure
			   code, which would make the continue below unreachable -
			   confirm whether a soft failure path was intended here. */
			HANDLE_CLERROR(ret_code, "Error running kernel in find_best_KPC()");
			continue;
		}
		clFinish(queue_prof);
		/* Kernel time: submit-to-end of the crypt kernel. */
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = malloc(sizeof(cl_uint) * num);
		/* Add the result read-back time to the measurement. */
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		/* Keep the candidate with the lowest scaled time per key. */
		if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ;
			optimal_kpc = num;
		}
		free(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
	}
	fprintf(stderr, "Optimal keys per crypt %d\n(to avoid this test on next run do \"export GWS=%d\")\n",optimal_kpc,optimal_kpc);
	max_keys_per_crypt = optimal_kpc;
	release_clobj();
	create_clobj(optimal_kpc);
}
static void find_best_kpc(void){ int num; cl_event myEvent; cl_ulong startTime, endTime, tmpTime; int kernelExecTimeNs = INT_MAX; cl_int ret_code; int optimal_kpc=2048; int i = 0; cl_uint *tmpbuffer; fprintf(stderr, "Calculating best keys per crypt, this will take a while "); for( num=MAX_KEYS_PER_CRYPT; num >= 4096 ; num -= 4096){ release_clobj(); create_clobj(num); advance_cursor(); queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); for (i=0; i < num; i++){ strcpy(&(saved_plain[i * keybuf_size]), tests[0].plaintext); } clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, keybuf_size * num, saved_plain, 0, NULL, NULL); ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent); if(ret_code != CL_SUCCESS){ fprintf(stderr, "Error %d\n",ret_code); continue; } clFinish(queue_prof); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL); tmpTime = endTime-startTime; tmpbuffer = mem_alloc(sizeof(cl_uint) * num); clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL); tmpTime = tmpTime + (endTime-startTime); if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) { kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ; optimal_kpc = num; } MEM_FREE(tmpbuffer); clReleaseCommandQueue(queue_prof); } fprintf(stderr, "Optimal keys per crypt %d\n(to avoid this test on next run do export GWS=%d)\n",optimal_kpc,optimal_kpc); global_work_size = optimal_kpc; release_clobj(); create_clobj(optimal_kpc); }
/*
 * One-shot OpenCL bring-up for the standalone SHA-1 tool: remember the
 * requested keys-per-crypt, then load the kernel source, set up the
 * device, build the kernel and allocate the buffers.  The calls are
 * order-dependent: each step relies on state set up by the previous one.
 */
void sha1_init(size_t user_kpc)
{
	kpc = user_kpc;
	load_source();
	createDevice();
	createkernel();
	create_clobj();
}
/* -- This function could be used to calculated the best num of keys per crypt for the given format -- */ static void find_best_gws(struct fmt_main * self, int sequential_id) { //Call the common function. common_find_best_gws( sequential_id, ROUNDS_DEFAULT, 0, (cpu(device_info[ocl_gpu_id]) ? 500000000ULL : 2400000000ULL) ); create_clobj(global_work_size, self); }
static void fmt_ssha_init(struct fmt_main *pFmt) { char *temp; opencl_init("$JOHN/ssha_kernel.cl", gpu_id, platform_id); // create kernel to execute crypt_kernel = clCreateKernel(program[gpu_id], "sha1_crypt_kernel", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG))) local_work_size = atoi(temp); if ((temp = getenv("LWS"))) local_work_size = atoi(temp); if (!local_work_size) { create_clobj(SSHA_NUM_KEYS); find_best_workgroup(); release_clobj(); } if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, KPC_CONFIG))) max_keys_per_crypt = atoi(temp); else max_keys_per_crypt = SSHA_NUM_KEYS; if ((temp = getenv("KPC"))) max_keys_per_crypt = atoi(temp); if (max_keys_per_crypt) { create_clobj(max_keys_per_crypt); } else { //user chose to die of boredom max_keys_per_crypt = SSHA_NUM_KEYS; create_clobj(SSHA_NUM_KEYS); find_best_kpc(); } printf("Local work size (LWS) %d, Keys per crypt (KPC) %d\n",(int)local_work_size,max_keys_per_crypt); pFmt->params.max_keys_per_crypt = max_keys_per_crypt; }
/*
 * Search for the best global work size by doubling GWS from 4*LWS up to
 * 8M, profiling the crypt kernel plus result read-back at each size and
 * keeping the size with the lowest scaled per-key time.  Stops early when
 * the kernel launch hits a resource limit.  Sets global_work_size; all
 * candidate buffers are released before returning.
 */
static void find_best_gws(int do_benchmark, struct fmt_main *self)
{
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	/* Best scaled time-per-key so far. */
	int kernelExecTimeNs = INT_MAX;
	cl_int ret_code;
	int optimal_kpc=2048;
	int gws, i = 0;
	cl_uint *tmpbuffer;

	for(gws = local_work_size << 2; gws <= 8*1024*1024; gws <<= 1) {
		create_clobj(gws, self);
		advance_cursor();
		queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		/* Fill the key buffer with a fixed dummy plaintext. */
		for (i=0; i < gws; i++) {
			memcpy(&(saved_plain[i*PLAINTEXT_LENGTH]),"abacaeaf",PLAINTEXT_LENGTH);
		}
		clEnqueueWriteBuffer(queue_prof, mysalt, CL_TRUE, 0, SALT_SIZE, saved_salt, 0, NULL, NULL);
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH) * gws, saved_plain, 0, NULL, NULL);
		ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if(ret_code != CL_SUCCESS) {
			// We hit some resource limit so we end here.
			release_clobj();
			break;
		}
		clFinish(queue_prof);
		/* Kernel time: submit-to-end of the crypt kernel. */
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = malloc(sizeof(cl_uint) * gws);
		/* Add the result read-back time to the measurement. */
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * gws, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END , sizeof(cl_ulong), &endTime , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		/* In verbose mode print raw speed (ns -> c/s and microseconds). */
		if (do_benchmark)
			fprintf(stderr, "%10d %10llu c/s %-.2f us\n", gws, gws * 1000000000ULL / tmpTime, tmpTime / 1000.0);
		/* Keep the candidate with the lowest scaled time per key. */
		if( ((int)( ((float) (tmpTime) / gws) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / gws) * 10) ) ;
			optimal_kpc = gws;
		}
		MEM_FREE(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
		release_clobj();
	}
	global_work_size = optimal_kpc;
}
/*
 * Format init for the unsplit PBKDF2-HMAC-SHA512 OpenCL format: builds
 * the kernel, applies the default LWS/GWS, allocates buffers, then clamps
 * LWS to the kernel's maximum work-group size.
 */
static void init(struct fmt_main *self)
{
	cl_ulong maxsize;

	opencl_init("$JOHN/kernels/pbkdf2_hmac_sha512_unsplit_kernel.cl", ocl_gpu_id);
	/* Seed LWS/GWS with the compile-time defaults. */
	set_lws_gws(DEFAULT_LWS,DEFAULT_GWS);
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_NAME, &cl_error);
	HANDLE_CLERROR(cl_error, "Error creating kernel");
	create_clobj(global_work_size, self);
	/* Note: we ask for the kernels' max sizes, not the device's! */
	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max workgroup size");
	/* Halve LWS until the kernel accepts it. */
	while (local_work_size > maxsize)
		local_work_size >>= 1;
	self->params.min_keys_per_crypt = local_work_size;
	self->params.max_keys_per_crypt = global_work_size;
}
/*
 * Time one GWS candidate for the RAR format: allocate buffers, upload a
 * fixed salt/keys/lengths set, run the kernel once and read the derived
 * AES key/IV back.  Returns the summed kernel + read-back time in ns,
 * or 0 if the kernel launch failed (e.g. a resource limit was hit).
 */
cl_ulong gws_test(int gws)
{
	cl_ulong startTime, endTime, run_time;
	cl_command_queue queue_prof;
	cl_event myEvent;
	cl_int ret_code;
	int i;
	/* VF is the vector width: actual keys processed = VF * gws. */
	int num = VF * gws;

	create_clobj(gws);
	queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	/* Same self-test plaintext in every slot. */
	for (i = 0; i < num; i++)
		set_key(rar_fmt.params.tests[0].plaintext, i);
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_salt, BLOCK_IF_DEBUG, 0, 8, saved_salt, 0, NULL, NULL), "Failed transferring salt");
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_key, BLOCK_IF_DEBUG, 0, UNICODE_LENGTH * num, saved_key, 0, NULL, NULL), "Failed transferring keys");
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_len, BLOCK_IF_DEBUG, 0, sizeof(int) * num, saved_len, 0, NULL, NULL), "Failed transferring lengths");
	ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
	if (ret_code != CL_SUCCESS) {
		/* Candidate failed (most likely too large); report and bail out. */
		fprintf(stderr, "Error: %s\n", get_error_name(ret_code));
		clReleaseCommandQueue(queue_prof);
		release_clobj();
		return 0;
	}
	HANDLE_CLERROR(clFinish(queue_prof), "Failed running kernel");
	/* Kernel time: submit-to-end. */
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
	run_time = endTime - startTime;
	/* Read both outputs; myEvent ends up referring to the key read, whose
	   time is added below. */
	HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_aes_iv, BLOCK_IF_DEBUG, 0, 16 * num, aes_iv, 0, NULL, &myEvent), "Failed reading iv back");
	HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_aes_key, BLOCK_IF_DEBUG, 0, 16 * num, aes_key, 0, NULL, &myEvent), "Failed reading key back");
	HANDLE_CLERROR(clFinish(queue_prof), "Failed reading results back");
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
	clReleaseCommandQueue(queue_prof);
	release_clobj();
	return (run_time + endTime - startTime);
}
/* ------- Initialization ------- */
/*
 * Format init for the SHA-256 OpenCL format: builds the crypt and compare
 * kernels, then resolves LWS/GWS from john.conf ([Options:OpenCL]) and the
 * LWS/GWS environment variables.  Unset/invalid LWS triggers a work-group
 * benchmark; GWS 0 triggers the (slow) global-work-size search.  The _TYPE
 * env var can force a different runtime/source variant.
 */
static void init(struct fmt_main * self) {
	char * tmp_value;
	char * task = "$JOHN/sha256_kernel.cl";
	opencl_init_dev(ocl_gpu_id, platform_id);
	source_in_use = device_info[ocl_gpu_id];
	/* Allow overriding the detected device type via the _TYPE env var. */
	if ((tmp_value = getenv("_TYPE")))
		source_in_use = atoi(tmp_value);
	opencl_build_kernel(task, ocl_gpu_id);
	// create kernel(s) to execute
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
	cmp_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_cmp", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel_cmp. Double-check kernel name?");
	global_work_size = get_task_max_size();
	local_work_size = 0;
	if (source_in_use != device_info[ocl_gpu_id]) {
		device_info[ocl_gpu_id] = source_in_use;
		fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);
	}
	/* Config file first, then environment override. */
	if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
		local_work_size = atoi(tmp_value);
	if ((tmp_value = getenv("LWS")))
		local_work_size = atoi(tmp_value);
	//Check if local_work_size is a valid number.
	if (local_work_size > get_task_max_work_group_size()){
		fprintf(stderr, "Error: invalid local work size (LWS). Max value allowed is: %zd\n" , get_task_max_work_group_size());
		local_work_size = 0; //Force find a valid number.
	}
	self->params.max_keys_per_crypt = global_work_size;
	if (!local_work_size) {
		/* Benchmark for the best work-group size. */
		local_work_size = get_task_max_work_group_size();
		create_clobj(global_work_size, self);
		find_best_workgroup(self);
		release_clobj();
	}
	if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
		global_work_size = atoi(tmp_value);
	if ((tmp_value = getenv("GWS")))
		global_work_size = atoi(tmp_value);
	//Check if a valid multiple is used.
	global_work_size = get_multiple(global_work_size, local_work_size);
	if (global_work_size)
		create_clobj(global_work_size, self);
	else {
		//user chose to die of boredom
		global_work_size = get_task_max_size();
		find_best_gws(self);
	}
	fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n", (int) local_work_size, global_work_size);
	self->params.max_keys_per_crypt = global_work_size;
}
/* -- This function could be used to calculated the best num of keys per crypt for the given format -- */
/*
 * Stepped GWS search: walk candidate sizes via get_step(), time each with
 * gws_test() and keep the last size that delivered a meaningful speed
 * improvement.  The STEP env var switches to a fixed step plus verbose
 * benchmark output; DUR_CONFIG limits how long one candidate may take.
 */
static void find_best_gws(struct fmt_main * self) {
	size_t num = 0;
	cl_ulong run_time, min_time = CL_ULONG_MAX;
	int optimal_gws = local_work_size, step = STEP;
	int do_benchmark = 0;
	unsigned int SHAspeed, bestSHAspeed = 0;
	/* Abort threshold: candidates slower than this (ns) end the search. */
	unsigned long long int max_run_time = 1000000000ULL;
	char *tmp_value;

	if ((tmp_value = getenv("STEP"))){
		step = atoi(tmp_value);
		do_benchmark = 1;
	}
	/* Keep the step a multiple of LWS. */
	step = get_multiple(step, local_work_size);

	if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, DUR_CONFIG)))
		max_run_time = atoi(tmp_value) * 1000000000UL;

	fprintf(stderr, "Calculating best global work size (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, max_run_time / 1000000000ULL);

	if (do_benchmark)
		fprintf(stderr, "Raw speed figures including buffer transfers:\n");

	for (num = get_step(num, step, 1); num; num = get_step(num, step, 0)) {
		//Check if hardware can handle the size we are going to try now.
		if (sizeof(sha256_password) * num * 1.2 > get_max_mem_alloc_size(ocl_gpu_id))
			break;
		/* 0 from gws_test means the launch failed; skip this size. */
		if (!(run_time = gws_test(num, self)))
			continue;
		if (!do_benchmark)
			advance_cursor();
		SHAspeed = num / (run_time / 1000000000.);
		if (run_time < min_time)
			min_time = run_time;
		if (do_benchmark) {
			fprintf(stderr, "gws: %8zu\t%12lu c/s %8.3f ms per crypt_all()", num, (long) (num / (run_time / 1000000000.)), (float) run_time / 1000000.);
			if (run_time > max_run_time) {
				fprintf(stderr, " - too slow\n");
				break;
			}
		} else {
			/* Quiet mode: stop once clearly past the sweet spot. */
			if (run_time > min_time * 20 || run_time > max_run_time)
				break;
		}
		/* Accept a candidate only on a meaningful speed improvement. */
		if (((long) SHAspeed - bestSHAspeed) > 10000) {
			if (do_benchmark)
				fprintf(stderr, "+");
			bestSHAspeed = SHAspeed;
			optimal_gws = num;
		}
		if (do_benchmark)
			fprintf(stderr, "\n");
	}
	fprintf(stderr, "Optimal global work size %d\n", optimal_gws);
	fprintf(stderr, "(to avoid this test on next run, put \"" GWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS SUBSECTION_OPENCL "])\n", optimal_gws);
	global_work_size = optimal_gws;
	create_clobj(optimal_gws, self);
}
//Do the proper test using different sizes.
/*
 * Time one GWS candidate for the SHA-256 format.  Measures the key
 * transfer, the kernel run and the hash read-back separately and returns
 * their sum in ns, or 0 if the kernel launch failed.
 */
static cl_ulong gws_test(size_t num, struct fmt_main * self) {
	cl_event myEvent;
	cl_int ret_code;
	cl_uint *tmpbuffer;
	cl_ulong startTime, endTime, runtime;
	int i;

	//Prepare buffers.
	create_clobj(num, self);

	tmpbuffer = mem_alloc(sizeof(sha256_hash) * num);
	if (tmpbuffer == NULL) {
		fprintf(stderr, "Malloc failure in find_best_gws\n");
		exit(EXIT_FAILURE);
	}

	queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");

	// Set keys
	for (i = 0; i < num; i++) {
		set_key("aaabaabaaa", i);
	}
	//** Get execution time **//
	/* 1) Host-to-device key transfer. */
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, sizeof(sha256_password) * num, plaintext, 0, NULL, &myEvent), "Failed in clEnqueueWriteBuffer");
	HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II");
	HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
	runtime = endTime - startTime;
	//** Get execution time **//
	/* 2) Kernel execution (launch errors checked after cleanup below). */
	ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
	HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II");
	HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
	runtime += endTime - startTime;
	//** Get execution time **//
	/* 3) Device-to-host hash read-back. */
	HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0, sizeof(uint32_t) * num, tmpbuffer, 0, NULL, &myEvent), "Failed in clEnqueueReadBuffer");
	HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I");
	HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II");
	HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
	runtime += endTime - startTime;

	MEM_FREE(tmpbuffer);
	HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), "Failed in clReleaseCommandQueue");
	release_clobj();

	if (ret_code != CL_SUCCESS) {
		/* CL_INVALID_WORK_GROUP_SIZE is expected while probing; stay quiet. */
		if (ret_code != CL_INVALID_WORK_GROUP_SIZE)
			fprintf(stderr, "Error %d\n", ret_code);
		return 0;
	}
	return runtime;
}
/*
 * Format init for the RAR format.  When built with OpenCL
 * (CL_VERSION_1_0), builds the kernel and resolves LWS/GWS from
 * john.conf/environment with auto-tune fallbacks; otherwise the CPU/OpenMP
 * path sizes the key space.  Also performs OpenSSL and CRC-32 one-time
 * setup shared by both paths.
 */
static void init(struct fmt_main *self)
{
#ifdef CL_VERSION_1_0
	char *temp;
	cl_ulong maxsize;

	global_work_size = 0;

	opencl_init("$JOHN/rar_kernel.cl", ocl_gpu_id, platform_id);

	// create kernel to execute
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "SetCryptKeys", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

	/* We mimic the lengths of cRARk for comparisons */
	if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_GPU) {
#ifndef DEBUG
		self->params.benchmark_comment = " (6 characters)";
#endif
		self->params.tests = gpu_tests;
#if defined(DEBUG) && !defined(ALWAYS_OPENCL)
		fprintf(stderr, "Note: will use CPU for some self-tests, and Single mode.\n");
#endif
	}

	/* Config file first, then environment overrides. */
	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
		local_work_size = atoi(temp);
	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
		global_work_size = atoi(temp);
	if ((temp = getenv("LWS")))
		local_work_size = atoi(temp);
	if ((temp = getenv("GWS")))
		global_work_size = atoi(temp);

	/* Note: we ask for this kernel's max size, not the device's! */
	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");

#ifdef DEBUG
	fprintf(stderr, "Max allowed local work size %d\n", (int)maxsize);
#endif

	if (!local_work_size) {
		/* Heuristic defaults per device type/vendor. */
		if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_CPU) {
			if (get_platform_vendor_id(platform_id) == INTEL)
				local_work_size = 8;
			else
				local_work_size = 1;
		} else {
			local_work_size = 64;
		}
	}

	if (local_work_size > maxsize) {
		fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
		local_work_size = maxsize;
	}

	/* temp still holds the getenv("GWS") result here: it distinguishes
	   "GWS unset" from "GWS explicitly 0" for the search's verbosity. */
	if (!global_work_size)
		find_best_gws(temp == NULL ? 0 : 1);

	if (global_work_size < local_work_size)
		global_work_size = local_work_size;

	fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
	create_clobj(global_work_size);

#ifdef DEBUG
	{
		cl_ulong loc_mem_size;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(loc_mem_size), &loc_mem_size, NULL), "Query local memory usage");
		fprintf(stderr, "Kernel using %lu bytes of local memory out of %lu available\n", loc_mem_size, get_local_memory_size(ocl_gpu_id));
	}
#endif

	atexit(release_clobj);

	/* VF is the vector width: total keys = VF * GWS. */
	*mkpc = VF * global_work_size;
#endif	/* OpenCL */

#if defined (_OPENMP)
	omp_t = omp_get_max_threads();
	self->params.min_keys_per_crypt *= omp_t;
#ifndef CL_VERSION_1_0	/* OpenCL gets to decide */
	*mkpc = omp_t * OMP_SCALE * MAX_KEYS_PER_CRYPT;
#endif
	init_locks();
#endif /* _OPENMP */

	if (options.utf8)
		self->params.plaintext_length = PLAINTEXT_LENGTH * 3;

	unpack_data = mem_calloc_tiny(sizeof(unpack_data_t) * omp_t, MEM_ALIGN_WORD);
	cracked = mem_calloc_tiny(sizeof(*cracked) * *mkpc, MEM_ALIGN_WORD);
#ifndef CL_VERSION_1_0
	/* Host-side buffers (the OpenCL path allocates these in create_clobj). */
	saved_key = mem_calloc_tiny(UNICODE_LENGTH * *mkpc, MEM_ALIGN_NONE);
	saved_len = mem_calloc_tiny(sizeof(*saved_len) * *mkpc, MEM_ALIGN_WORD);
	saved_salt = mem_calloc_tiny(8, MEM_ALIGN_NONE);
	aes_key = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
	aes_iv = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
#endif

	/* OpenSSL init */
	init_aesni();
	SSL_load_error_strings();
	SSL_library_init();
	OpenSSL_add_all_algorithms();
#ifndef __APPLE__
	atexit(openssl_cleanup);
#endif

	/* CRC-32 table init, do it before we start multithreading */
	{
		CRC32_t crc;
		CRC32_Init(&crc);
	}
}
/* ------- Initialization ------- */
/*
 * Format init for the crypt-SHA512 OpenCL format.  Selects a kernel
 * source variant (DEFAULT/LOCAL/GPU) based on the device - overridable
 * via the _TYPE env var - builds it (warning about long GPU build times),
 * creates the extra prepare/final kernels where applicable, then resolves
 * LWS/GWS from john.conf and the environment with auto-tune fallbacks.
 */
static void init(struct fmt_main * self) {
	char * tmp_value;
	char * task = "$JOHN/cryptsha512_kernel_DEFAULT.cl";
	uint64_t startTime, runtime;

	opencl_init_dev(ocl_gpu_id, platform_id);
	startTime = (unsigned long) time(NULL);
	source_in_use = device_info[ocl_gpu_id];
	/* Env overrides: _TYPE forces a runtime variant, _FAST enables fast mode. */
	if ((tmp_value = getenv("_TYPE")))
		source_in_use = atoi(tmp_value);
	if ((tmp_value = getenv("_FAST")))
		fast_mode = TRUE;
	if (use_local(source_in_use))
		task = "$JOHN/cryptsha512_kernel_LOCAL.cl";
	else if (gpu(source_in_use)) {
		fprintf(stderr, "Building the kernel, this could take a while\n");
		task = "$JOHN/cryptsha512_kernel_GPU.cl";
	}
	fflush(stdout);
	opencl_build_kernel(task, ocl_gpu_id);
	/* Report noticeably slow builds. */
	if ((runtime = (unsigned long) (time(NULL) - startTime)) > 2UL)
		fprintf(stderr, "Elapsed time: %lu seconds\n", runtime);
	fflush(stdout);
	// create kernel(s) to execute
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
	/* GPU/LOCAL variants use a three-stage prepare/crypt/final pipeline. */
	if (gpu(source_in_use) || use_local(source_in_use)) {
		prepare_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_prepare", &ret_code);
		HANDLE_CLERROR(ret_code, "Error creating kernel_prepare. Double-check kernel name?");
		final_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_final", &ret_code);
		HANDLE_CLERROR(ret_code, "Error creating kernel_final. Double-check kernel name?");
	}
	global_work_size = get_task_max_size();
	local_work_size = get_default_workgroup();
	if (source_in_use != device_info[ocl_gpu_id])
		fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);
	/* Config file first, then environment override. */
	if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
		local_work_size = atoi(tmp_value);
	if ((tmp_value = getenv("LWS")))
		local_work_size = atoi(tmp_value);
	//Check if local_work_size is a valid number.
	if (local_work_size > get_task_max_work_group_size()){
		local_work_size = 0; //Force find a valid number.
	}
	self->params.max_keys_per_crypt = global_work_size;
	if (!local_work_size) {
		/* Benchmark for the best work-group size. */
		local_work_size = get_task_max_work_group_size();
		create_clobj(global_work_size, self);
		find_best_workgroup(self);
		release_clobj();
	}
	if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
		global_work_size = atoi(tmp_value);
	if ((tmp_value = getenv("GWS")))
		global_work_size = atoi(tmp_value);
	//Check if a valid multiple is used.
	global_work_size = get_multiple(global_work_size, local_work_size);
	if (global_work_size)
		create_clobj(global_work_size, self);
	else {
		//user chose to die of boredom
		global_work_size = get_task_max_size();
		find_best_gws(self);
	}
	fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n", (int) local_work_size, global_work_size);
	self->params.max_keys_per_crypt = global_work_size;
}
static void init(struct fmt_main *self) { char *temp; cl_ulong maxsize, maxsize2; char build_opts[64]; global_work_size = 0; snprintf(build_opts, sizeof(build_opts), "-DHASH_LOOPS=%u -DUNICODE_LENGTH=%u %s", HASH_LOOPS, UNICODE_LENGTH, (options.flags & FLG_VECTORIZE) ? "-DVECTORIZE" : (options.flags & FLG_SCALAR) ? "-DSCALAR" : ""); opencl_init_opt("$JOHN/office2007_kernel.cl", ocl_gpu_id, platform_id, build_opts); // create kernel to execute GenerateSHA1pwhash = clCreateKernel(program[ocl_gpu_id], "GenerateSHA1pwhash", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); crypt_kernel = clCreateKernel(program[ocl_gpu_id], "HashLoop", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); Generate2007key = clCreateKernel(program[ocl_gpu_id], "Generate2007key", &ret_code); HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?"); if (options.flags & FLG_VECTORIZE) { /* Run vectorized code */ VF = 4; self->params.algorithm_name = "OpenCL 4x"; } if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG))) local_work_size = atoi(temp); if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG))) global_work_size = atoi(temp); if ((temp = getenv("LWS"))) local_work_size = atoi(temp); if ((temp = getenv("GWS"))) global_work_size = atoi(temp); /* Note: we ask for the kernels' max sizes, not the device's! 
*/ HANDLE_CLERROR(clGetKernelWorkGroupInfo(GenerateSHA1pwhash, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size"); HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size"); if (maxsize2 < maxsize) maxsize = maxsize2; HANDLE_CLERROR(clGetKernelWorkGroupInfo(Generate2007key, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size"); if (maxsize2 < maxsize) maxsize = maxsize2; #if 0 /* Our use of local memory sets a limit for LWS */ maxsize2 = get_local_memory_size(ocl_gpu_id) / (24 * VF); while (maxsize > maxsize2) maxsize >>= 1; #endif /* maxsize is the lowest figure from the three different kernels */ if (!local_work_size) { if (getenv("LWS")) { /* LWS was explicitly set to 0 */ int temp = global_work_size; local_work_size = maxsize; global_work_size = global_work_size ? global_work_size : 4 * maxsize; create_clobj(global_work_size, self); opencl_find_best_workgroup_limit(self, maxsize); release_clobj(); global_work_size = temp; } else { if (cpu(device_info[ocl_gpu_id])) { if (get_platform_vendor_id(platform_id) == DEV_INTEL) local_work_size = MIN(maxsize, 8); else local_work_size = 1; } else local_work_size = MIN(maxsize, 64); } } if (local_work_size > maxsize) { fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize); local_work_size = maxsize; } if (!global_work_size) find_best_gws(getenv("GWS") == NULL ? 0 : 1, self); if (global_work_size < local_work_size) global_work_size = local_work_size; fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size); create_clobj(global_work_size, self); atexit(release_clobj); if (options.utf8) self->params.plaintext_length = MIN(125, 3 * PLAINTEXT_LENGTH); }
static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self) { cl_ulong startTime, endTime; cl_command_queue queue_prof; cl_event Event[6]; cl_int ret_code; int i; size_t scalar_gws = VF * gws; create_clobj(gws, self); queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); for (i = 0; i < scalar_gws; i++) set_key(tests[0].plaintext, i); set_salt(get_salt(tests[0].ciphertext)); HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_key, CL_TRUE, 0, UNICODE_LENGTH * scalar_gws, saved_key, 0, NULL, &Event[0]), "Failed transferring keys"); HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_len, CL_TRUE, 0, sizeof(int) * scalar_gws, saved_len, 0, NULL, &Event[1]), "Failed transferring lengths"); HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, GenerateSHA1pwhash, 1, NULL, &scalar_gws, &local_work_size, 0, NULL, &Event[2]), "running kernel"); for (i = 0; i < 50000 / HASH_LOOPS - 1; i++) HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "running kernel"); HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[3]), "running kernel"); HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, Generate2007key, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[4]), "running kernel"); HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_key, CL_TRUE, 0, 16 * scalar_gws, key, 0, NULL, &Event[5]), "failed in reading key back"); #if 0 HANDLE_CLERROR(clGetEventProfilingInfo(Event[2], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL), "Failed to get profiling info"); HANDLE_CLERROR(clGetEventProfilingInfo(Event[2], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed to get profiling info"); fprintf(stderr, "GenerateSHA1pwhash kernel duration: %llu us, ", (endTime-startTime)/1000ULL); #endif 
HANDLE_CLERROR(clGetEventProfilingInfo(Event[3], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL), "Failed to get profiling info"); HANDLE_CLERROR(clGetEventProfilingInfo(Event[3], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed to get profiling info"); if (do_benchmark) fprintf(stderr, "%.2f ms x %u = %.2f s\t", (float)((endTime - startTime)/1000000.), 50000/HASH_LOOPS, (float)(50000/HASH_LOOPS) * (endTime - startTime) / 1000000000.); /* 200 ms duration limit for GCN to avoid ASIC hangs */ if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) { if (do_benchmark) fprintf(stderr, "- exceeds 200 ms\n"); clReleaseCommandQueue(queue_prof); release_clobj(); return 0; } #if 0 HANDLE_CLERROR(clGetEventProfilingInfo(Event[4], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL), "Failed to get profiling info"); HANDLE_CLERROR(clGetEventProfilingInfo(Event[4], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed to get profiling info"); fprintf(stderr, "Generate2007key kernel duration: %llu us\n", (endTime-startTime)/1000ULL); #endif HANDLE_CLERROR(clGetEventProfilingInfo(Event[0], CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed to get profiling info"); HANDLE_CLERROR(clGetEventProfilingInfo(Event[5], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed to get profiling info"); clReleaseCommandQueue(queue_prof); release_clobj(); return (endTime - startTime); }