static int crypt_all(int *pcount, struct db_salt *salt) { const int count = *pcount; int i; size_t scalar_gws; global_work_size = ((count + (v_width * local_work_size - 1)) / (v_width * local_work_size)) * local_work_size; scalar_gws = global_work_size * v_width; #if 0 fprintf(stderr, "%s(%d) lws "Zu" gws "Zu" sgws "Zu"\n", __FUNCTION__, count, local_work_size, global_work_size, scalar_gws); #endif /// Copy data to gpu if (new_keys) { HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0, key_buf_size, inbuffer, 0, NULL, NULL), "Copy data to gpu"); new_keys = 0; } /// Run kernels HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_init, 1, NULL, &global_work_size, &local_work_size, 0, NULL, firstEvent), "Run initial kernel"); for (i = 0; i < LOOP_COUNT; i++) { HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_loop, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run loop kernel"); HANDLE_CLERROR(clFinish(queue[gpu_id]), "Error running loop kernel"); opencl_process_event(); } HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_final, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run intermediate kernel"); /// Read the result back HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, sizeof(pbkdf1_out) * scalar_gws, host_crack, 0, NULL, NULL), "Copy result back"); return count; }
static int crypt_all(int *pcount, struct db_salt *salt) { const int count = *pcount; int i, index; size_t *lws = local_work_size ? &local_work_size : NULL; global_work_size = local_work_size ? (count + local_work_size - 1) / local_work_size * local_work_size : count; if (any_cracked) { memset(cracked, 0, cracked_size); any_cracked = 0; } // Copy data to gpu HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0, insize, inbuffer, 0, NULL, NULL), "Copy data to gpu"); // Run 1st kernel HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], sevenzip_init, 1, NULL, &global_work_size, lws, 0, NULL, NULL), "Run init kernel"); // Run loop kernel for (i = 0; i < LOOP_COUNT; i++) { HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL, &global_work_size, lws, 0, NULL, NULL), "Run loop kernel"); HANDLE_CLERROR(clFinish(queue[gpu_id]), "Error running loop kernel"); opencl_process_event(); } // Read the result back HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, outsize, outbuffer, 0, NULL, NULL), "Copy result back"); #ifdef _OPENMP #pragma omp parallel for #endif for (index = 0; index < count; index++) { /* decrypt and check */ if(sevenzip_decrypt(outbuffer[index].key, cur_salt->data) == 0) { cracked[index] = 1; #ifdef _OPENMP #pragma omp atomic #endif any_cracked |= 1; } } return count; }
static int crypt_all(int *pcount, struct db_salt *salt) { int count = *pcount; int i = 0; global_work_size = (count + local_work_size - 1) / local_work_size * local_work_size; ///Copy data to GPU memory if (new_keys) HANDLE_CLERROR(clEnqueueWriteBuffer (queue[ocl_gpu_id], mem_in, CL_FALSE, 0, insize, host_pass, 0, NULL, NULL), "Copy memin"); HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_salt, CL_FALSE, 0, saltsize, host_salt, 0, NULL, NULL), "Copy memsalt"); HANDLE_CLERROR(clEnqueueNDRangeKernel (queue[ocl_gpu_id], init_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Set ND range"); ///Run kernel for(i = 0; i < 8; i++) { HANDLE_CLERROR(clEnqueueNDRangeKernel (queue[ocl_gpu_id], crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Set ND range"); HANDLE_CLERROR(clFinish(queue[ocl_gpu_id]), "Error running loop kernel"); opencl_process_event(); } HANDLE_CLERROR(clEnqueueNDRangeKernel (queue[ocl_gpu_id], finish_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Set ND range"); HANDLE_CLERROR(clEnqueueReadBuffer(queue[ocl_gpu_id], mem_out, CL_FALSE, 0, outsize, host_hash, 0, NULL, NULL), "Copy data back"); ///Await completion of all the above HANDLE_CLERROR(clFinish(queue[ocl_gpu_id]), "clFinish error"); new_keys = 0; return count; }
static int crypt_all_benchmark(int *pcount, struct db_salt *salt) { int count = *pcount; int i; global_work_size = (count + local_work_size - 1) / local_work_size * local_work_size; BENCH_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_in, CL_FALSE, 0, insize, host_pass, 0, NULL, &multi_profilingEvent[0]), "Copy memin"); BENCH_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_salt, CL_FALSE, 0, saltsize, host_salt, 0, NULL, &multi_profilingEvent[1]), "Copy memsalt"); ///Run the init kernel BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], init_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &multi_profilingEvent[2]), "Set ND range"); ///Run split kernel for(i = 0; i < 3; i++) { BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &multi_profilingEvent[split_events[i]]), "Set ND range"); //3, 4, 5 BENCH_CLERROR(clFinish(queue[ocl_gpu_id]), "Error running loop kernel"); opencl_process_event(); } ///Run the finish kernel BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], finish_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &multi_profilingEvent[6]), "Set ND range"); BENCH_CLERROR(clEnqueueReadBuffer(queue[ocl_gpu_id], mem_out, CL_FALSE, 0, outsize, host_hash, 0, NULL, &multi_profilingEvent[7]), "Copy data back"); ///Await completion of all the above BENCH_CLERROR(clFinish(queue[ocl_gpu_id]), "clFinish error"); return count; }
static int crypt_all(int *pcount, struct db_salt *salt) { int i; const int count = *pcount; int loops = (host_salt->rounds + HASH_LOOPS - 1) / HASH_LOOPS; opencl_limit_gws(count); #if 0 printf("crypt_all(%d)\n", count); printf("LWS = %d, GWS = %d, loops=%d\n",(int)local_work_size, (int)global_work_size, loops); #endif /// Copy data to gpu HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0, global_work_size * sizeof(pass_t), host_pass, 0, NUUL, NULL), "Copy data to gpu"); /// Run kernel HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NUUL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run kernel"); for(i = 0; i < loops; i++) { HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], split_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run split kernel"); HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish"); opencl_process_event(); } /// Read the result back HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, global_work_size * sizeof(crack_t), host_crack, 0, NUUL, NULL), "Copy result back"); return count; }
/*
 * Run the three-stage PBKDF2 pipeline (init kernel, iterated loop
 * kernel, final kernel) for `num` candidates on one device.
 *
 * In PROFILE mode this doubles as the auto-tuner: kernel times are
 * accumulated via event profiling, a failed enqueue halves the LWS and
 * returns early, and a new best time doubles the LWS for the next try.
 * In normal mode the result is read back asynchronously, chained on the
 * last kernel's event, into events[] for the caller to wait on.
 *
 * Returns the device's buffer set either way.
 */
static gpu_mem_buffer exec_pbkdf2(cl_uint *pass_api, cl_uint *salt_api, cl_uint saltlen_api, cl_uint *hash_out_api, cl_uint num, int jtrUniqDevNo, cl_command_queue cmdq)
{
	cl_event evnt;
	/* N = global work size, M = current local work size for this device */
	size_t N = num, M = globalObj[jtrUniqDevNo].lws;
	cl_int err;
	unsigned int i, itrCntKrnl = ITERATION_COUNT_PER_CALL;
	/* Kernel time accumulated across all three stages (PROFILE only). */
	cl_ulong _kernelExecTimeNs = 0;

	/* Blocking uploads of the candidate passwords and the salt. */
	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu, CL_TRUE, 0, 4 * num * sizeof(cl_uint), pass_api, 0, NULL, NULL), "Copy data to gpu");
	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu, CL_TRUE, 0, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), salt_api, 0, NULL, NULL), "Copy data to gpu");

	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 2, sizeof(cl_uint), &saltlen_api), "Set Kernel 0 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_uint), &num), "Set Kernel 0 Arg 3 :FAILED");

	/* Stage 1: init kernel.  A failed enqueue during auto-tune means
	   the LWS was too large: halve it and let the caller retry. */
	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[0], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
		else
			HANDLE_CLERROR(err, "Enque Kernel Failed");
		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {
		cl_ulong startTime, endTime;

		HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
		HANDLE_CLERROR(clFinish(cmdq), "clFinish error");
		/* SUBMIT..END covers queueing latency plus execution. */
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
		_kernelExecTimeNs = endTime - startTime;
	}

	/* Stage 2: loop kernel, itrCntKrnl iterations per enqueue.  The
	   final pass drops one iteration so the total is 10240 - 1. */
	for (i = 0; i < (10240 - 1); i = i + itrCntKrnl) {
		if (i == (10240 - itrCntKrnl))
			--itrCntKrnl;

		HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_uint), &itrCntKrnl), "Set Kernel 1 Arg 1 :FAILED");

		err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[1], 1, NULL, &N, &M, 0, NULL, &evnt);
		if (err) {
			if (PROFILE)
				globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
			else
				HANDLE_CLERROR(err, "Enque Kernel Failed");
			return globalObj[jtrUniqDevNo].gpu_buffer;
		}

		opencl_process_event();

		if (PROFILE) {
			cl_ulong startTime, endTime;

			HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync FAILED");
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
			_kernelExecTimeNs += endTime - startTime;
		}
		else if (active_dev_ctr == 1)
			/* Single active device: keep the queue drained. */
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");
	}

	/* Stage 3: final kernel. */
	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[2], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
		else
			HANDLE_CLERROR(err, "Enque Kernel Failed");
		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {
		cl_ulong startTime, endTime;

		HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
		HANDLE_CLERROR(clFinish(cmdq), "clFinish error");
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
		_kernelExecTimeNs += endTime - startTime;

		/* New best time: record it, double the LWS for the next
		   tuning round, and update the speed estimate (calls/sec). */
		if (_kernelExecTimeNs < kernelExecTimeNs) {
			kernelExecTimeNs = _kernelExecTimeNs;
			//printf("%d\n",(int)kernelExecTimeNs);
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws * 2;
			globalObj[jtrUniqDevNo].exec_time_inv = (long double)pow(10, 9) / (long double)kernelExecTimeNs;
		}
	}
	else
		/* Non-blocking read of the hashes, chained on the final
		   kernel's event; completion event stored for the caller.
		   NOTE(review): the "Write :FAILED" message describes a
		   read — confirm wording upstream. */
		HANDLE_CLERROR(clEnqueueReadBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4*num*sizeof(cl_uint), hash_out_api, 1, &evnt, &events[event_ctr++]), "Write :FAILED");

	return globalObj[jtrUniqDevNo].gpu_buffer;
}