static void init(struct fmt_main *self) {
	char *kpc;

	global_work_size = MAX_KEYS_PER_CRYPT;

	opencl_init("$JOHN/md4_kernel.cl", ocl_gpu_id, platform_id);
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "md4", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
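	/* LWS: honor a non-zero environment override, otherwise auto-tune the work-group size */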
	if( ((kpc = getenv("LWS")) == NULL) || (atoi(kpc) == 0)) {
		create_clobj(MD4_NUM_KEYS);
		opencl_find_best_workgroup(self);
		release_clobj();
	}else {
		local_work_size = atoi(kpc);
	}
	if( (kpc = getenv("GWS")) == NULL){
		max_keys_per_crypt = MD4_NUM_KEYS;
		create_clobj(MD4_NUM_KEYS);
	} else {
		if (atoi(kpc) == 0){
			//user chose to die of boredom
			max_keys_per_crypt = MD4_NUM_KEYS;
			create_clobj(MD4_NUM_KEYS);
			find_best_kpc();
		} else {
			max_keys_per_crypt = atoi(kpc);
			create_clobj(max_keys_per_crypt);
		}
	}
	fprintf(stderr, "Local work size (LWS) %d, Global work size (GWS) %d\n",(int)local_work_size, max_keys_per_crypt);
	self->params.max_keys_per_crypt = max_keys_per_crypt;
}
static void init(struct fmt_main *self)
{
	cl_ulong maxsize;
	size_t selected_gws;

	opencl_init_opt("$JOHN/kernels/pwsafe_kernel.cl", ocl_gpu_id, NULL);

	init_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_INIT_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating init kernel");

	crypt_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_RUN_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating crypt kernel");

	finish_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_FINISH_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating finish kernel");

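	/* Defaults: LWS 1 on CPU devices, 64 otherwise; GWS 0 means "auto-tune"; user preferences may override */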
	local_work_size = cpu(device_info[ocl_gpu_id]) ? 1 : 64;
	global_work_size = 0;
	opencl_get_user_preferences(CONFIG_NAME);

	//Initialize openCL tuning (library) for this format.
	opencl_init_auto_setup(STEP, ROUNDS_DEFAULT/8, 8, split_events,
		warn, &multi_profilingEvent[3], self, create_clobj, release_clobj,
		sizeof(pwsafe_pass), 0);

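	/* Use the benchmark variant of crypt_all during auto-tuning; restored near the end of init() */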
	self->methods.crypt_all = crypt_all_benchmark;

	selected_gws = global_work_size;
	/* Note: we ask for the kernels' max sizes, not the device's! */
	maxsize = get_current_work_group_size(ocl_gpu_id, init_kernel);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, crypt_kernel),
	              maxsize);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, finish_kernel),
	              maxsize);

	while (local_work_size > maxsize)
		local_work_size >>= 1;

	self->params.max_keys_per_crypt = (global_work_size ? global_work_size : MAX_KEYS_PER_CRYPT);

	if (!local_work_size) {
		create_clobj(self->params.max_keys_per_crypt, self);
		find_best_lws(self, ocl_gpu_id);
		release_clobj();
	}
	global_work_size = selected_gws;

	if (global_work_size)
		create_clobj(global_work_size, self);
	else
		//GWS is 0: run the (time-consuming) auto-tune
		find_best_gws(self, ocl_gpu_id);

	self->params.min_keys_per_crypt = local_work_size;
	self->params.max_keys_per_crypt = global_work_size;
	self->methods.crypt_all = crypt_all;

	if (options.verbosity > 2)
		fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
}
static void fmt_ssha_init(struct fmt_main *pFmt)
{
	char *kpc;
	opencl_init("$JOHN/ssha_opencl_kernel.cl", gpu_id);

	// create kernel to execute
	crypt_kernel = clCreateKernel(program[gpu_id], "sha1_crypt_kernel", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

	if( (kpc = getenv("LWS")) == NULL){
		create_clobj(SSHA_NUM_KEYS);
		find_best_workgroup();
		release_clobj();
	}else {
		local_work_size = atoi(kpc);
	}
	if( (kpc = getenv("KPC")) == NULL){
		max_keys_per_crypt = SSHA_NUM_KEYS;
		create_clobj(SSHA_NUM_KEYS);
	} else {
		if (atoi(kpc) == 0){
			//user chose to die of boredom
			max_keys_per_crypt = SSHA_NUM_KEYS;
			create_clobj(SSHA_NUM_KEYS);
			find_best_kpc();
		} else {
			max_keys_per_crypt = atoi(kpc);
			create_clobj(max_keys_per_crypt);
		}
	}
	printf("Local work size (LWS) %d, Keys per crypt (KPC) %d\n",(int)local_work_size,max_keys_per_crypt);
	pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
}
static void fmt_ssha_init(struct fmt_main *self)
{
    char *temp;
    cl_ulong maxsize;

    global_work_size = 0;

    opencl_init("$JOHN/ssha_kernel.cl", ocl_gpu_id, platform_id);

    // create kernel to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "sha1_crypt_kernel", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

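    /* Ask for this kernel's maximum work-group size on the device */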
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(temp);

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(temp);

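    /* Environment variables override the john.conf values read above */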
    if ((temp = getenv("LWS")))
        local_work_size = atoi(temp);

    if ((temp = getenv("GWS")))
        global_work_size = atoi(temp);

    if (!local_work_size) {
        int temp = global_work_size;
        local_work_size = maxsize;
        global_work_size = global_work_size ? global_work_size : 4 * maxsize;
        create_clobj(global_work_size, self);
        opencl_find_best_workgroup_limit(self, maxsize);
        release_clobj();
        global_work_size = temp;
    }

    if (local_work_size > maxsize) {
        fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
        local_work_size = maxsize;
    }

    if (!global_work_size)
        find_best_gws(getenv("GWS") == NULL ? 0 : 1, self);

    if (global_work_size < local_work_size)
        global_work_size = local_work_size;

    fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
    create_clobj(global_work_size, self);
    atexit(release_clobj);
}
static void find_best_kpc(void){
    int num;
    cl_event myEvent;
    cl_ulong startTime, endTime, tmpTime;
    int kernelExecTimeNs = INT_MAX;
    cl_int ret_code;
    int optimal_kpc=2048;
    int i = 0;
    cl_uint *tmpbuffer;

    printf("Calculating best keys per crypt, this will take a while ");
    for (num = SSHA_NUM_KEYS; num > 4096; num -= 16384) {
        release_clobj();
        create_clobj(num);
        advance_cursor();
        queue_prof = clCreateCommandQueue(context[gpu_id], devices[gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
        for (i = 0; i < num; i++) {
            strncpy(&(saved_plain[i * PLAINTEXT_LENGTH]), "abacaeaf", PLAINTEXT_LENGTH);
        }
        /* Upload the salt and candidate keys, then time one profiled kernel run */
        clEnqueueWriteBuffer(queue_prof, mysalt, CL_TRUE, 0, SALT_SIZE, saved_salt, 0, NULL, NULL);
        clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH) * num, saved_plain, 0, NULL, NULL);
        ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
        if (ret_code != CL_SUCCESS) {
            fprintf(stderr, "Error %d\n", ret_code);
            clReleaseCommandQueue(queue_prof);
            continue;
        }
        clFinish(queue_prof);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
        tmpTime = endTime - startTime;
        tmpbuffer = malloc(sizeof(cl_uint) * num);
        clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
        tmpTime = tmpTime + (endTime - startTime);
        if (((int)(((float) tmpTime / num) * 10)) <= kernelExecTimeNs) {
            kernelExecTimeNs = (int)(((float) tmpTime / num) * 10);
            optimal_kpc = num;
        }
        free(tmpbuffer);
        clReleaseCommandQueue(queue_prof);
    }
    printf("Optimal keys per crypt %d\n(to avoid this test on next run, put \""
           KPC_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
           SUBSECTION_OPENCL "])\n", optimal_kpc, optimal_kpc);
    max_keys_per_crypt = optimal_kpc;
    release_clobj();
    create_clobj(optimal_kpc);
}
static void find_best_kpc(void){
	int num;
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	int kernelExecTimeNs = INT_MAX;
	cl_int ret_code;
	int optimal_kpc=2048;
	int i = 0;
	cl_uint *tmpbuffer;

	fprintf(stderr, "Calculating best keys per crypt, this will take a while ");
	for( num=MD4_NUM_KEYS; num > 4096 ; num -= 4096){
		release_clobj();
		create_clobj(num);
		advance_cursor();
		queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		for (i=0; i < num; i++){
			memcpy(&(saved_plain[i * (PLAINTEXT_LENGTH + 1)]), "abcaaeaf", PLAINTEXT_LENGTH + 1);
			saved_plain[i * (PLAINTEXT_LENGTH + 1) + 8] = 0x80;
		}
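		/* Upload the constants and candidate keys, then time one profiled kernel launch */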
		clEnqueueWriteBuffer(queue_prof, data_info, CL_TRUE, 0, sizeof(unsigned int) * 2, datai, 0, NULL, NULL);
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH + 1) * num, saved_plain, 0, NULL, NULL);
		ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if(ret_code != CL_SUCCESS) {
			HANDLE_CLERROR(ret_code, "Error running kernel in find_best_KPC()");
			continue;
		}
		clFinish(queue_prof);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = malloc(sizeof(cl_uint) * num);
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ;
			optimal_kpc = num;
		}
		free(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
	}
	fprintf(stderr, "Optimal keys per crypt %d\n(to avoid this test on next run do \"export GWS=%d\")\n",optimal_kpc,optimal_kpc);
	max_keys_per_crypt = optimal_kpc;
	release_clobj();
	create_clobj(optimal_kpc);
}
static void find_best_kpc(void){
	int num;
	cl_event myEvent;
	cl_ulong startTime, endTime, tmpTime;
	int kernelExecTimeNs = INT_MAX;
	cl_int ret_code;
	int optimal_kpc=2048;
	int i = 0;
	cl_uint *tmpbuffer;

	fprintf(stderr, "Calculating best keys per crypt, this will take a while ");
	for( num=MAX_KEYS_PER_CRYPT; num >= 4096 ; num -= 4096){
		release_clobj();
		create_clobj(num);
		advance_cursor();
		queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
		for (i=0; i < num; i++){
			strcpy(&(saved_plain[i * keybuf_size]), tests[0].plaintext);
		}
		clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, keybuf_size * num, saved_plain, 0, NULL, NULL);
		ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
		if (ret_code != CL_SUCCESS) {
			fprintf(stderr, "Error %d\n", ret_code);
			clReleaseCommandQueue(queue_prof);
			continue;
		}
		clFinish(queue_prof);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = endTime-startTime;
		tmpbuffer = mem_alloc(sizeof(cl_uint) * num);
		clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * num, tmpbuffer, 0, NULL, &myEvent);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
		tmpTime = tmpTime + (endTime-startTime);
		if( ((int)( ((float) (tmpTime) / num) * 10 )) <= kernelExecTimeNs) {
			kernelExecTimeNs = ((int) (((float) (tmpTime) / num) * 10) ) ;
			optimal_kpc = num;
		}
		MEM_FREE(tmpbuffer);
		clReleaseCommandQueue(queue_prof);
	}
	fprintf(stderr, "Optimal keys per crypt %d\n(to avoid this test on next run do export GWS=%d)\n",optimal_kpc,optimal_kpc);
	global_work_size = optimal_kpc;
	release_clobj();
	create_clobj(optimal_kpc);
}
void sha1_init(size_t user_kpc)
{
	kpc = user_kpc;
	load_source();
	createDevice();
	createkernel();
	create_clobj();
}
/* --
  This function can be used to calculate the best number
  of keys per crypt for the given format
-- */
static void find_best_gws(struct fmt_main * self, int sequential_id) {

	//Call the common function.
	common_find_best_gws(
		sequential_id, ROUNDS_DEFAULT, 0,
		(cpu(device_info[ocl_gpu_id]) ? 500000000ULL : 2400000000ULL)
	);

	create_clobj(global_work_size, self);
}
static void fmt_ssha_init(struct fmt_main *pFmt)
{
	char *temp;
	opencl_init("$JOHN/ssha_kernel.cl", gpu_id, platform_id);

	// create kernel to execute
	crypt_kernel = clCreateKernel(program[gpu_id], "sha1_crypt_kernel", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
	                          LWS_CONFIG)))
		local_work_size = atoi(temp);

	if ((temp = getenv("LWS")))
		local_work_size = atoi(temp);

	if (!local_work_size) {
		create_clobj(SSHA_NUM_KEYS);
		find_best_workgroup();
		release_clobj();
	}

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
	                          KPC_CONFIG)))
		max_keys_per_crypt = atoi(temp);
	else
		max_keys_per_crypt = SSHA_NUM_KEYS;

	if ((temp = getenv("KPC")))
		max_keys_per_crypt = atoi(temp);

	if (max_keys_per_crypt) {
		create_clobj(max_keys_per_crypt);
	} else {
		//KPC is 0: run the (time-consuming) keys-per-crypt search
		max_keys_per_crypt = SSHA_NUM_KEYS;
		create_clobj(SSHA_NUM_KEYS);
		find_best_kpc();
	}
	printf("Local work size (LWS) %d, Keys per crypt (KPC) %d\n",(int)local_work_size,max_keys_per_crypt);
	pFmt->params.max_keys_per_crypt = max_keys_per_crypt;
}
static void find_best_gws(int do_benchmark, struct fmt_main *self)
{
    cl_event myEvent;
    cl_ulong startTime, endTime, tmpTime;
    int kernelExecTimeNs = INT_MAX;
    cl_int ret_code;
    int optimal_kpc=2048;
    int gws, i = 0;
    cl_uint *tmpbuffer;

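    /* Grow the GWS geometrically, timing a full transfer + kernel + read cycle at each size */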
    for(gws = local_work_size << 2; gws <= 8*1024*1024; gws <<= 1) {
        create_clobj(gws, self);
        advance_cursor();
        queue_prof = clCreateCommandQueue( context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
        for (i=0; i < gws; i++) {
            memcpy(&(saved_plain[i*PLAINTEXT_LENGTH]),"abacaeaf",PLAINTEXT_LENGTH);
        }
        clEnqueueWriteBuffer(queue_prof, mysalt, CL_TRUE, 0, SALT_SIZE, saved_salt, 0, NULL, NULL);
        clEnqueueWriteBuffer(queue_prof, buffer_keys, CL_TRUE, 0, (PLAINTEXT_LENGTH) * gws, saved_plain, 0, NULL, NULL);
        ret_code = clEnqueueNDRangeKernel( queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
        if (ret_code != CL_SUCCESS) {
            // We hit some resource limit so we end here.
            clReleaseCommandQueue(queue_prof);
            release_clobj();
            break;
        }
        clFinish(queue_prof);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
        tmpTime = endTime-startTime;
        tmpbuffer = malloc(sizeof(cl_uint) * gws);
        clEnqueueReadBuffer(queue_prof, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * gws, tmpbuffer, 0, NULL, &myEvent);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
        clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END  , sizeof(cl_ulong), &endTime  , NULL);
        tmpTime = tmpTime + (endTime-startTime);
        if (do_benchmark)
            fprintf(stderr, "%10d %10llu c/s %-.2f us\n", gws, gws * 1000000000ULL / tmpTime, tmpTime / 1000.0);
        if( ((int)( ((float) (tmpTime) / gws) * 10 )) <= kernelExecTimeNs) {
            kernelExecTimeNs = ((int) (((float) (tmpTime) / gws) * 10) ) ;
            optimal_kpc = gws;
        }
        MEM_FREE(tmpbuffer);
        clReleaseCommandQueue(queue_prof);
        release_clobj();
    }
    global_work_size = optimal_kpc;
}
static void init(struct fmt_main *self)
{
	cl_ulong maxsize;

	opencl_init("$JOHN/kernels/pbkdf2_hmac_sha512_unsplit_kernel.cl", ocl_gpu_id);
	
	set_lws_gws(DEFAULT_LWS,DEFAULT_GWS);
	
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_NAME, &cl_error);
	HANDLE_CLERROR(cl_error, "Error creating kernel");

	create_clobj(global_work_size, self);

	/* Note: we ask for the kernels' max sizes, not the device's! */
	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel,
		devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE,
		sizeof(maxsize), &maxsize, NULL), "Query max workgroup size");
	while (local_work_size > maxsize)
		local_work_size >>= 1;

	self->params.min_keys_per_crypt = local_work_size;
	self->params.max_keys_per_crypt = global_work_size;
}
cl_ulong gws_test(int gws)
{
	cl_ulong startTime, endTime, run_time;
	cl_command_queue queue_prof;
	cl_event myEvent;
	cl_int ret_code;
	int i;
	int num = VF * gws;

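	/* Allocate buffers for this candidate GWS and use a profiling-enabled queue for timing */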
	create_clobj(gws);
	queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	for (i = 0; i < num; i++)
		set_key(rar_fmt.params.tests[0].plaintext, i);
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_salt, BLOCK_IF_DEBUG, 0, 8, saved_salt, 0, NULL, NULL), "Failed transferring salt");
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_key, BLOCK_IF_DEBUG, 0, UNICODE_LENGTH * num, saved_key, 0, NULL, NULL), "Failed transferring keys");
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_len, BLOCK_IF_DEBUG, 0, sizeof(int) * num, saved_len, 0, NULL, NULL), "Failed transferring lengths");
	ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &myEvent);
	if (ret_code != CL_SUCCESS) {
		fprintf(stderr, "Error: %s\n", get_error_name(ret_code));
		clReleaseCommandQueue(queue_prof);
		release_clobj();
		return 0;
	}
	HANDLE_CLERROR(clFinish(queue_prof), "Failed running kernel");
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
	run_time = endTime - startTime;
	HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_aes_iv, BLOCK_IF_DEBUG, 0, 16 * num, aes_iv, 0, NULL, &myEvent), "Failed reading iv back");
	HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_aes_key, BLOCK_IF_DEBUG, 0, 16 * num, aes_key, 0, NULL, &myEvent), "Failed reading key back");
	HANDLE_CLERROR(clFinish(queue_prof), "Failed reading results back");
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
	clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
	clReleaseCommandQueue(queue_prof);
	release_clobj();

	return (run_time + endTime - startTime);
}
/* ------- Initialization  ------- */
static void init(struct fmt_main * self) {
    char * tmp_value;
    char * task = "$JOHN/sha256_kernel.cl";

    opencl_init_dev(ocl_gpu_id, platform_id);
    source_in_use = device_info[ocl_gpu_id];

    if ((tmp_value = getenv("_TYPE")))
        source_in_use = atoi(tmp_value);

    opencl_build_kernel(task, ocl_gpu_id);

    // create kernel(s) to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    cmp_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_cmp", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel_cmp. Double-check kernel name?");

    global_work_size = get_task_max_size();
    local_work_size = 0;

    if (source_in_use != device_info[ocl_gpu_id]) {
        device_info[ocl_gpu_id] = source_in_use;
        fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("LWS")))
        local_work_size = atoi(tmp_value);

    //Check if local_work_size is a valid number.
    if (local_work_size > get_task_max_work_group_size()){
        fprintf(stderr, "Error: invalid local work size (LWS). Max value allowed is: %zd\n" ,
               get_task_max_work_group_size());
        local_work_size = 0; //Force find a valid number.
    }
    self->params.max_keys_per_crypt = global_work_size;

    if (!local_work_size) {
        local_work_size = get_task_max_work_group_size();
        create_clobj(global_work_size, self);
        find_best_workgroup(self);
        release_clobj();
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("GWS")))
        global_work_size = atoi(tmp_value);

    //Check if a valid multiple is used.
    global_work_size = get_multiple(global_work_size, local_work_size);

    if (global_work_size)
        create_clobj(global_work_size, self);
    else {
        //GWS is 0: run the (time-consuming) auto-tune
        global_work_size = get_task_max_size();
        find_best_gws(self);
    }
    fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n",
           (int) local_work_size, global_work_size);
    self->params.max_keys_per_crypt = global_work_size;
}
/* --
  This function can be used to calculate the best number
  of keys per crypt for the given format
-- */
static void find_best_gws(struct fmt_main * self) {
    size_t num = 0;
    cl_ulong run_time, min_time = CL_ULONG_MAX;

    int optimal_gws = local_work_size, step = STEP;
    int do_benchmark = 0;
    unsigned int SHAspeed, bestSHAspeed = 0;
    unsigned long long int max_run_time = 1000000000ULL;
    char *tmp_value;

    if ((tmp_value = getenv("STEP"))){
        step = atoi(tmp_value);
        do_benchmark = 1;
    }
    step = get_multiple(step, local_work_size);

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, DUR_CONFIG)))
        max_run_time = atoi(tmp_value) * 1000000000UL;

    fprintf(stderr, "Calculating best global work size (GWS) for LWS=%zd and max. %llu s duration.\n\n",
            local_work_size, max_run_time / 1000000000ULL);

    if (do_benchmark)
        fprintf(stderr, "Raw speed figures including buffer transfers:\n");

    for (num = get_step(num, step, 1); num; num = get_step(num, step, 0)) {
        //Check if hardware can handle the size we are going to try now.
        if (sizeof(sha256_password) * num * 1.2 > get_max_mem_alloc_size(ocl_gpu_id))
            break;

        if (!(run_time = gws_test(num, self)))
            continue;

        if (!do_benchmark)
            advance_cursor();

        SHAspeed = num / (run_time / 1000000000.);

        if (run_time < min_time)
            min_time = run_time;

        if (do_benchmark) {
            fprintf(stderr, "gws: %8zu\t%12lu c/s %8.3f ms per crypt_all()",
                    num, (long) (num / (run_time / 1000000000.)),
                    (float) run_time / 1000000.);

            if (run_time > max_run_time) {
                fprintf(stderr, " - too slow\n");
                break;
            }
        } else {
            if (run_time > min_time * 20 || run_time > max_run_time)
                break;
        }
        if (((long) SHAspeed - bestSHAspeed) > 10000) {
            if (do_benchmark)
                fprintf(stderr, "+");
            bestSHAspeed = SHAspeed;
            optimal_gws = num;
        }
        if (do_benchmark)
            fprintf(stderr, "\n");
    }
    fprintf(stderr, "Optimal global work size %d\n", optimal_gws);
    fprintf(stderr, "(to avoid this test on next run, put \""
        GWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
        SUBSECTION_OPENCL "])\n", optimal_gws);
    global_work_size = optimal_gws;
    create_clobj(optimal_gws, self);
}
//Do the proper test using different sizes.
static cl_ulong gws_test(size_t num, struct fmt_main * self) {

    cl_event myEvent;
    cl_int ret_code;
    cl_uint *tmpbuffer;
    cl_ulong startTime, endTime, runtime;
    int i;

    //Prepare buffers.
    create_clobj(num, self);

    tmpbuffer = mem_alloc(sizeof(sha256_hash) * num);

    if (tmpbuffer == NULL) {
        fprintf(stderr, "Malloc failure in find_best_gws\n");
        exit(EXIT_FAILURE);
    }

    queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id],
            CL_QUEUE_PROFILING_ENABLE, &ret_code);
    HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");

    // Set keys
    for (i = 0; i < num; i++) {
        set_key("aaabaabaaa", i);
    }
    //** Get execution time **//
    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0,
            sizeof(sha256_password) * num, plaintext, 0, NULL, &myEvent),
            "Failed in clEnqueueWriteBuffer");

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime = endTime - startTime;

    //** Get execution time **//
    ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel,
            1, NULL, &num, &local_work_size, 0, NULL, &myEvent);

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime += endTime - startTime;

    //** Get execution time **//
    HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0,
            sizeof(uint32_t) * num, tmpbuffer, 0, NULL, &myEvent),
            "Failed in clEnqueueReadBuffer");

    HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
            sizeof(cl_ulong), &startTime, NULL),
            "Failed in clGetEventProfilingInfo I");
    HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong), &endTime, NULL),
            "Failed in clGetEventProfilingInfo II");
    HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
    runtime += endTime - startTime;

    MEM_FREE(tmpbuffer);
    HANDLE_CLERROR(clReleaseCommandQueue(queue_prof),
            "Failed in clReleaseCommandQueue");
    release_clobj();

    if (ret_code != CL_SUCCESS) {
        if (ret_code != CL_INVALID_WORK_GROUP_SIZE)
            fprintf(stderr, "Error %d\n", ret_code);
        return 0;
    }
    return runtime;
}
static void init(struct fmt_main *self)
{
#ifdef CL_VERSION_1_0
	char *temp;
	cl_ulong maxsize;

	global_work_size = 0;

	opencl_init("$JOHN/rar_kernel.cl", ocl_gpu_id, platform_id);

	// create kernel to execute
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "SetCryptKeys", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

	/* We mimic the lengths of cRARk for comparisons */
	if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_GPU) {
#ifndef DEBUG
		self->params.benchmark_comment = " (6 characters)";
#endif
		self->params.tests = gpu_tests;
#if defined(DEBUG) && !defined(ALWAYS_OPENCL)
		fprintf(stderr, "Note: will use CPU for some self-tests, and Single mode.\n");
#endif
	}

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
		local_work_size = atoi(temp);

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
		global_work_size = atoi(temp);

	if ((temp = getenv("LWS")))
		local_work_size = atoi(temp);

	if ((temp = getenv("GWS")))
		global_work_size = atoi(temp);

	/* Note: we ask for this kernel's max size, not the device's! */
	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");

#ifdef DEBUG
	fprintf(stderr, "Max allowed local work size %d\n", (int)maxsize);
#endif

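	/* No LWS given: choose a per-device default before validating it */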
	if (!local_work_size) {
		if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_CPU) {
			if (get_platform_vendor_id(platform_id) == INTEL)
				local_work_size = 8;
			else
				local_work_size = 1;
		} else {
			local_work_size = 64;
		}
	}

	if (local_work_size > maxsize) {
		fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
		local_work_size = maxsize;
	}

	if (!global_work_size)
		find_best_gws(temp == NULL ? 0 : 1);

	if (global_work_size < local_work_size)
		global_work_size = local_work_size;

	fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);

	create_clobj(global_work_size);

#ifdef DEBUG
	{
		cl_ulong loc_mem_size;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(loc_mem_size), &loc_mem_size, NULL), "Query local memory usage");
		fprintf(stderr, "Kernel using %lu bytes of local memory out of %lu available\n", loc_mem_size, get_local_memory_size(ocl_gpu_id));
	}
#endif

	atexit(release_clobj);

	*mkpc = VF * global_work_size;

#endif	/* OpenCL */

#if defined (_OPENMP)
	omp_t = omp_get_max_threads();
	self->params.min_keys_per_crypt *= omp_t;
#ifndef CL_VERSION_1_0	/* OpenCL gets to decide */
	*mkpc = omp_t * OMP_SCALE * MAX_KEYS_PER_CRYPT;
#endif
	init_locks();
#endif /* _OPENMP */

	if (options.utf8)
		self->params.plaintext_length = PLAINTEXT_LENGTH * 3;

	unpack_data = mem_calloc_tiny(sizeof(unpack_data_t) * omp_t, MEM_ALIGN_WORD);
	cracked = mem_calloc_tiny(sizeof(*cracked) * *mkpc, MEM_ALIGN_WORD);
#ifndef CL_VERSION_1_0
	saved_key = mem_calloc_tiny(UNICODE_LENGTH * *mkpc, MEM_ALIGN_NONE);
	saved_len = mem_calloc_tiny(sizeof(*saved_len) * *mkpc, MEM_ALIGN_WORD);
	saved_salt = mem_calloc_tiny(8, MEM_ALIGN_NONE);
	aes_key = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
	aes_iv = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
#endif

	/* OpenSSL init */
	init_aesni();
	SSL_load_error_strings();
	SSL_library_init();
	OpenSSL_add_all_algorithms();
#ifndef __APPLE__
	atexit(openssl_cleanup);
#endif
	/* CRC-32 table init, do it before we start multithreading */
	{
		CRC32_t crc;
		CRC32_Init(&crc);
	}
}
/* ------- Initialization  ------- */
static void init(struct fmt_main * self) {
    char * tmp_value;
    char * task = "$JOHN/cryptsha512_kernel_DEFAULT.cl";
    uint64_t startTime, runtime;

    opencl_init_dev(ocl_gpu_id, platform_id);
    startTime = (unsigned long) time(NULL);
    source_in_use = device_info[ocl_gpu_id];

    if ((tmp_value = getenv("_TYPE")))
        source_in_use = atoi(tmp_value);

    if ((tmp_value = getenv("_FAST")))
        fast_mode = TRUE;

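    // Pick the kernel source variant for this device: local-memory, GPU, or the default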
    if (use_local(source_in_use))
            task = "$JOHN/cryptsha512_kernel_LOCAL.cl";
    else if (gpu(source_in_use)) {
        fprintf(stderr, "Building the kernel, this could take a while\n");
        task = "$JOHN/cryptsha512_kernel_GPU.cl";
    }
    fflush(stdout);
    opencl_build_kernel(task, ocl_gpu_id);

    if ((runtime = (unsigned long) (time(NULL) - startTime)) > 2UL)
        fprintf(stderr, "Elapsed time: %lu seconds\n", runtime);
    fflush(stdout);

    // create kernel(s) to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

    if (gpu(source_in_use) || use_local(source_in_use)) {
        prepare_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_prepare", &ret_code);
        HANDLE_CLERROR(ret_code, "Error creating kernel_prepare. Double-check kernel name?");
        final_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_final", &ret_code);
        HANDLE_CLERROR(ret_code, "Error creating kernel_final. Double-check kernel name?");
    }
    global_work_size = get_task_max_size();
    local_work_size = get_default_workgroup();

    if (source_in_use != device_info[ocl_gpu_id])
        fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("LWS")))
        local_work_size = atoi(tmp_value);

    //Check if local_work_size is a valid number.
    if (local_work_size > get_task_max_work_group_size()){
        local_work_size = 0; //Force find a valid number.
    }
    self->params.max_keys_per_crypt = global_work_size;

    if (!local_work_size) {
        local_work_size = get_task_max_work_group_size();
        create_clobj(global_work_size, self);
        find_best_workgroup(self);
        release_clobj();
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("GWS")))
        global_work_size = atoi(tmp_value);

    //Check if a valid multiple is used.
    global_work_size = get_multiple(global_work_size, local_work_size);

    if (global_work_size)
        create_clobj(global_work_size, self);
    else {
        //GWS is 0: run the (time-consuming) auto-tune
        global_work_size = get_task_max_size();
        find_best_gws(self);
    }
    fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n",
           (int) local_work_size, global_work_size);
    self->params.max_keys_per_crypt = global_work_size;
}
static void init(struct fmt_main *self)
{
    char *temp;
    cl_ulong maxsize, maxsize2;
    char build_opts[64];

    global_work_size = 0;

    snprintf(build_opts, sizeof(build_opts),
             "-DHASH_LOOPS=%u -DUNICODE_LENGTH=%u %s",
             HASH_LOOPS,
             UNICODE_LENGTH,
             (options.flags & FLG_VECTORIZE) ? "-DVECTORIZE" :
             (options.flags & FLG_SCALAR) ? "-DSCALAR" : "");
    opencl_init_opt("$JOHN/office2007_kernel.cl", ocl_gpu_id, platform_id, build_opts);

    // create kernel to execute
    GenerateSHA1pwhash = clCreateKernel(program[ocl_gpu_id], "GenerateSHA1pwhash", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "HashLoop", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    Generate2007key = clCreateKernel(program[ocl_gpu_id], "Generate2007key", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

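    /* With vectorization the kernel packs VF=4 keys per work-item */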
    if (options.flags & FLG_VECTORIZE) {
        /* Run vectorized code */
        VF = 4;
        self->params.algorithm_name = "OpenCL 4x";
    }

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(temp);

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(temp);

    if ((temp = getenv("LWS")))
        local_work_size = atoi(temp);

    if ((temp = getenv("GWS")))
        global_work_size = atoi(temp);

    /* Note: we ask for the kernels' max sizes, not the device's! */
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(GenerateSHA1pwhash, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size");
    if (maxsize2 < maxsize) maxsize = maxsize2;
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(Generate2007key, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size");
    if (maxsize2 < maxsize) maxsize = maxsize2;

#if 0
    /* Our use of local memory sets a limit for LWS */
    maxsize2 = get_local_memory_size(ocl_gpu_id) / (24 * VF);
    while (maxsize > maxsize2)
        maxsize >>= 1;
#endif

    /* maxsize is the lowest figure from the three different kernels */
    if (!local_work_size) {
        if (getenv("LWS")) {
            /* LWS was explicitly set to 0 */
            int temp = global_work_size;
            local_work_size = maxsize;
            global_work_size = global_work_size ? global_work_size : 4 * maxsize;
            create_clobj(global_work_size, self);
            opencl_find_best_workgroup_limit(self, maxsize);
            release_clobj();
            global_work_size = temp;
        } else {
            if (cpu(device_info[ocl_gpu_id])) {
                if (get_platform_vendor_id(platform_id) == DEV_INTEL)
                    local_work_size = MIN(maxsize, 8);
                else
                    local_work_size = 1;
            } else
                local_work_size = MIN(maxsize, 64);
        }
    }

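    /* Clamp LWS to the smallest of the three kernels' limits */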
    if (local_work_size > maxsize) {
        fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
        local_work_size = maxsize;
    }

    if (!global_work_size)
        find_best_gws(getenv("GWS") == NULL ? 0 : 1, self);

    if (global_work_size < local_work_size)
        global_work_size = local_work_size;

    fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
    create_clobj(global_work_size, self);
    atexit(release_clobj);

    if (options.utf8)
        self->params.plaintext_length = MIN(125, 3 * PLAINTEXT_LENGTH);
}
static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
{
    cl_ulong startTime, endTime;
    cl_command_queue queue_prof;
    cl_event Event[6];
    cl_int ret_code;
    int i;
    size_t scalar_gws = VF * gws;

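    /* Allocate buffers for this candidate GWS and time one full key derivation with profiling */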
    create_clobj(gws, self);
    queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
    for (i = 0; i < scalar_gws; i++)
        set_key(tests[0].plaintext, i);
    set_salt(get_salt(tests[0].ciphertext));

    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_key, CL_TRUE, 0, UNICODE_LENGTH * scalar_gws, saved_key, 0, NULL, &Event[0]), "Failed transferring keys");
    HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, cl_saved_len, CL_TRUE, 0, sizeof(int) * scalar_gws, saved_len, 0, NULL, &Event[1]), "Failed transferring lengths");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, GenerateSHA1pwhash, 1, NULL, &scalar_gws, &local_work_size, 0, NULL, &Event[2]), "running kernel");

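    /* The 50000 hash iterations are issued as 50000 / HASH_LOOPS HashLoop kernel calls */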
    for (i = 0; i < 50000 / HASH_LOOPS - 1; i++)
        HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "running kernel");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[3]), "running kernel");

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, Generate2007key, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &Event[4]), "running kernel");

    HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, cl_key, CL_TRUE, 0, 16 * scalar_gws, key, 0, NULL, &Event[5]), "failed in reading key back");

#if 0
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[2],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[2],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    fprintf(stderr, "GenerateSHA1pwhash kernel duration: %llu us, ", (endTime-startTime)/1000ULL);
#endif

    HANDLE_CLERROR(clGetEventProfilingInfo(Event[3],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[3],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    if (do_benchmark)
        fprintf(stderr, "%.2f ms x %u = %.2f s\t", (float)((endTime - startTime)/1000000.), 50000/HASH_LOOPS, (float)(50000/HASH_LOOPS) * (endTime - startTime) / 1000000000.);

    /* 200 ms duration limit for GCN to avoid ASIC hangs */
    if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
        if (do_benchmark)
            fprintf(stderr, "- exceeds 200 ms\n");
        clReleaseCommandQueue(queue_prof);
        release_clobj();
        return 0;
    }

#if 0
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[4],
                                           CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[4],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    fprintf(stderr, "Generate2007key kernel duration: %llu us\n", (endTime-startTime)/1000ULL);
#endif

    HANDLE_CLERROR(clGetEventProfilingInfo(Event[0],
                                           CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime,
                                           NULL), "Failed to get profiling info");
    HANDLE_CLERROR(clGetEventProfilingInfo(Event[5],
                                           CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
                                           NULL), "Failed to get profiling info");
    clReleaseCommandQueue(queue_prof);
    release_clobj();

    return (endTime - startTime);
}