static void find_best_workgroup(int jtrUniqDevNo) { size_t _lws=0; cl_device_type dTyp; cl_command_queue cmdq; cl_int err; cl_uint *dcc_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536)); cl_uint *dcc2_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536)); cl_uint salt_api[9], length = 10; event_ctr = 0; //HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info"); dTyp = get_device_type(jtrUniqDevNo); if (dTyp == CL_DEVICE_TYPE_CPU) globalObj[jtrUniqDevNo].lws = 1; else globalObj[jtrUniqDevNo].lws = 16; ///Set Dummy DCC hash , unicode salt and ascii salt(username) length memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536)); memset(salt_api, 0xfe, 9 * sizeof(cl_uint)); cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err); HANDLE_CLERROR(err, "Error creating command queue"); PROFILE = 1; kernelExecTimeNs = CL_ULONG_MAX; ///Find best local work size while (1) { _lws = globalObj[jtrUniqDevNo].lws; if (dTyp == CL_DEVICE_TYPE_CPU) { exec_pbkdf2(dcc_hash_host, salt_api, length, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq); globalObj[jtrUniqDevNo].exec_time_inv = globalObj[jtrUniqDevNo].exec_time_inv / 16; } else { exec_pbkdf2(dcc_hash_host, salt_api, length, dcc2_hash_host, ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536), jtrUniqDevNo, cmdq); globalObj[jtrUniqDevNo].exec_time_inv *= ((MAX_KEYS_PER_CRYPT < 65536) ? 
MAX_KEYS_PER_CRYPT : 65536) / 65536; } if (globalObj[jtrUniqDevNo].lws <= _lws) break; } PROFILE = 0; if (options.verbosity > 2) { fprintf(stderr, "Optimal Work Group Size:%d\n", (int)globalObj[jtrUniqDevNo].lws); fprintf(stderr, "Kernel Execution Speed (Higher is better):%Lf\n", globalObj[jtrUniqDevNo].exec_time_inv); } MEM_FREE(dcc_hash_host); MEM_FREE(dcc2_hash_host); HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed"); }
static unsigned int quick_bechmark(int jtrUniqDevNo) { cl_device_type dTyp; cl_command_queue cmdq; cl_int err; cl_uint *dcc_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * 4096); cl_uint *dcc2_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * 4096); cl_uint *hmac_sha1_out = (cl_uint*)mem_alloc(5 * sizeof(cl_uint) * 4096); cl_uint salt_api[9], length = 10; event_ctr = 0; //HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info"); dTyp = get_device_type(jtrUniqDevNo); if (dTyp == CL_DEVICE_TYPE_CPU) globalObj[jtrUniqDevNo].lws = 1; else globalObj[jtrUniqDevNo].lws = 64; ///Set Dummy DCC hash , unicode salt and ascii salt(username) length memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * 4096); memset(salt_api, 0xfe, 9 * sizeof(cl_uint)); cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err); HANDLE_CLERROR(err, "Error creating command queue"); PROFILE = 1; kernelExecTimeNs = CL_ULONG_MAX; exec_pbkdf2(dcc_hash_host, salt_api, length, 2048, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq, hmac_sha1_out); PROFILE = 0; if (globalObj[jtrUniqDevNo].exec_time_inv < 15) return 4; else if (globalObj[jtrUniqDevNo].exec_time_inv < 25) return 2; else return 1; MEM_FREE(dcc_hash_host); MEM_FREE(dcc2_hash_host); MEM_FREE(hmac_sha1_out); HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed"); }
///Split a PBKDF2 batch of `num` keys across all active OpenCL devices in
///proportion to each device's measured speed (globalObj[].exec_time_inv),
///enqueue the per-device slices, and wait for all of them to complete.
///pass_api supplies 4 cl_uints per key and hash_out_api receives 4
///cl_uints per key (see the "+ 4 * work_offset" pointer stepping below).
void pbkdf2_divide_work(cl_uint *pass_api, cl_uint *salt_api, cl_uint saltlen_api, cl_uint *hash_out_api, cl_uint num)
{
	double total_exec_time_inv = 0;
	int i;
	unsigned int work_part, work_offset = 0, lws_max = max_lws();
	cl_int ret;

	event_ctr = 0;
	/* NOTE(review): only `num` cl_uints are zeroed here, while the output
	 * appears to be 4 cl_uints per key — confirm whether 4 * num was
	 * intended. */
	memset(hash_out_api, 0, num * sizeof(cl_uint));

	/// Make num multiple of lws_max
	if (num % lws_max != 0)
		num = (num / lws_max + 1) * lws_max;

	///Divide work only if number of keys is greater than 8192, else use first device selected
	if (num > 8192) {
		///Calculates t0tal Kernel Execution Speed
		for (i = 0; i < active_dev_ctr; ++i)
			total_exec_time_inv += globalObj[ocl_device_list[i]].exec_time_inv;

		///Calculate work division ratio
		/* Normalizes each device's speed to a fraction of the total.
		 * This mutates exec_time_inv in place, so a later call sees the
		 * already-normalized values (which re-normalize to themselves). */
		for (i = 0; i < active_dev_ctr; ++i)
			globalObj[ocl_device_list[i]].exec_time_inv /= total_exec_time_inv;

		///Divide memory and work
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				/* Last device takes the remainder, rounded up to
				 * a multiple of lws_max. */
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				/* Proportional share, rounded up to lws_max. */
				work_part = num * globalObj[ocl_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			/* Guard against a zero/negative share (e.g. a device with a
			 * near-zero speed metric). */
			if ((int)work_part <= 0)
				work_part = lws_max;

			///call to exec_pbkdf2()
#ifdef _DEBUG
			printf("Work Offset:%d Work Part Size:%d %d\n",work_offset,work_part,event_ctr);
#endif
			exec_pbkdf2(pass_api + 4 * work_offset, salt_api, saltlen_api, hash_out_api + 4 * work_offset, work_part, ocl_device_list[i], queue[ocl_device_list[i]]);
			work_offset += work_part;
		}

		///Synchronize Device memory and Host memory
		/* Flush in reverse order so all queues start draining before we
		 * begin polling. */
		for (i = active_dev_ctr - 1; i >= 0; --i)
			HANDLE_CLERROR(clFlush(queue[ocl_device_list[i]]), "Flush Error");

		for (i = 0; i < active_dev_ctr; ++i) {
			/* Busy-wait (spin) until this device's event completes. */
			while (1) {
				HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
				if ((ret) == CL_COMPLETE)
					break;
#ifdef _DEBUG
				printf("%d%d ", ret, i);
#endif
			}
		}

		for (i = 0; i < active_dev_ctr; ++i)
			HANDLE_CLERROR(clFinish(queue[ocl_device_list[i]]), "Finish Error");
	}
	else {
		/* Small batch: run everything on the first selected device. */
		exec_pbkdf2(pass_api, salt_api, saltlen_api, hash_out_api, num, ocl_device_list[0], queue[ocl_device_list[0]]);
		HANDLE_CLERROR(clFinish(queue[ocl_device_list[0]]), "Finish Error");
	}
}
///Variant of pbkdf2_divide_work() taking an explicit iteration count and an
///HMAC-SHA1 state buffer (5 cl_uints per key): splits `num` keys across all
///active GPU devices in proportion to each device's measured speed, waits
///for the kernels, then reads each device's hash_out buffer back to host.
///The read-back loop below recomputes the SAME partition sizes as the
///enqueue loop, so both loops must stay in lockstep.
///NOTE(review): a same-named function with a different signature appears
///earlier — in one C translation unit that cannot link; presumably these
///come from separate source files. Confirm.
void pbkdf2_divide_work(cl_uint *pass_api, cl_uint *salt_api, cl_uint saltlen_api, unsigned int iter_cnt, cl_uint *hash_out_api, cl_uint *hmac_sha1_api, cl_uint num)
{
	double total_exec_time_inv = 0;
	int i;
	unsigned int work_part, work_offset = 0, lws_max = max_lws();
	cl_int ret;
#ifdef _DEBUG
	struct timeval startc, endc;
#endif

	event_ctr = 0;
	/* NOTE(review): only `num` cl_uints are zeroed, while the read-back
	 * below fills 4 cl_uints per key — confirm whether 4 * num was
	 * intended. */
	memset(hash_out_api, 0, num * sizeof(cl_uint));

	/// Make num multiple of lws_max
	if (num % lws_max != 0)
		num = (num / lws_max + 1) * lws_max;

	///Divide work only if number of keys is greater than 8192, else use first device selected
	if (num > 8192) {
		///Calculates t0tal Kernel Execution Speed
		for (i = 0; i < active_dev_ctr; ++i)
			total_exec_time_inv += globalObj[gpu_device_list[i]].exec_time_inv;

		///Calculate work division ratio
		/* Normalizes each device's speed in place; the read-back loop
		 * relies on these exact normalized values to recompute sizes. */
		for (i = 0; i < active_dev_ctr; ++i)
			globalObj[gpu_device_list[i]].exec_time_inv /= total_exec_time_inv;

		///Divide memory and work
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				/* Last device takes the remainder. */
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				/* Proportional share, rounded up to lws_max. */
				work_part = num * globalObj[gpu_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			if ((int)work_part <= 0)
				work_part = lws_max;
#ifdef _DEBUG
			gettimeofday(&startc, NULL) ;
			fprintf(stderr, "Work Offset:%d Work Part Size:%d Event No:%d",work_offset,work_part,event_ctr);
#endif
			///call to exec_pbkdf2()
			/* pass_api: 4 cl_uints/key, hmac_sha1_api: 5 cl_uints/key. */
			exec_pbkdf2(pass_api + 4 * work_offset, salt_api, saltlen_api, iter_cnt, hash_out_api + 4 * work_offset, work_part, gpu_device_list[i], queue[gpu_device_list[i]], hmac_sha1_api + 5 * work_offset);
			work_offset += work_part;
#ifdef _DEBUG
			gettimeofday(&endc, NULL);
			fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
		}

		///Synchronize all kernels
		/* Flush in reverse order so all queues start draining before we
		 * begin polling. */
		for (i = active_dev_ctr - 1; i >= 0; --i)
			HANDLE_CLERROR(clFlush(queue[gpu_device_list[i]]), "Flush Error");

		for (i = 0; i < active_dev_ctr; ++i) {
			/* Busy-wait (spin) until this device's event completes. */
			while (1) {
				HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
				if ((ret) == CL_COMPLETE)
					break;
#ifdef _DEBUG
				printf("%d%d ", ret, i);
#endif
			}
		}

		/* Reset counters and replay the identical partitioning to locate
		 * each device's slice of the output buffer. */
		event_ctr = work_part = work_offset = 0;

		///Read results back from all kernels
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				work_part = num * globalObj[gpu_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			if ((int)work_part <= 0)
				work_part = lws_max;
#ifdef _DEBUG
			gettimeofday(&startc, NULL) ;
			fprintf(stderr, "Work Offset:%d Work Part Size:%d Event No:%d",work_offset,work_part,event_ctr);
#endif
			///Read results back from device
			/* Non-blocking read; completion is enforced by the clFinish
			 * loop below. */
			HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_device_list[i]], globalObj[gpu_device_list[i]].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4 * work_part * sizeof(cl_uint), hash_out_api + 4 * work_offset, 0, NULL, &events[event_ctr++]), "Write :FAILED");
			work_offset += work_part;
#ifdef _DEBUG
			gettimeofday(&endc, NULL);
			fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
		}

		for (i = 0; i < active_dev_ctr; ++i)
			HANDLE_CLERROR(clFinish(queue[gpu_device_list[i]]), "Finish Error");
	}
	else {
		/* Small batch: run everything on the first GPU device. */
		exec_pbkdf2(pass_api, salt_api, saltlen_api, iter_cnt, hash_out_api, num, gpu_device_list[0], queue[gpu_device_list[0]], hmac_sha1_api);
		HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_device_list[0]], globalObj[gpu_device_list[0]].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4*num*sizeof(cl_uint), hash_out_api, 0, NULL, NULL), "Write :FAILED");
		HANDLE_CLERROR(clFinish(queue[gpu_device_list[0]]), "Finish Error");
	}
}