static int crypt_all(int *pcount, struct db_salt *salt)
{
    const int count = *pcount;
    int i;
    size_t scalar_gws;

    global_work_size = ((count + (v_width * local_work_size - 1)) / (v_width * local_work_size)) * local_work_size;
    scalar_gws = global_work_size * v_width;
#if 0
    fprintf(stderr, "%s(%d) lws "Zu" gws "Zu" sgws "Zu"\n", __FUNCTION__,
            count, local_work_size, global_work_size, scalar_gws);
#endif
    /// Copy data to gpu
    if (new_keys) {
        HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0, key_buf_size, inbuffer, 0, NULL, NULL), "Copy data to gpu");
        new_keys = 0;
    }

    /// Run kernels
    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_init, 1, NULL, &global_work_size, &local_work_size, 0, NULL, firstEvent), "Run initial kernel");

    for (i = 0; i < LOOP_COUNT; i++) {
        HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_loop, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run loop kernel");
        HANDLE_CLERROR(clFinish(queue[gpu_id]), "Error running loop kernel");
        opencl_process_event();
    }

    HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], pbkdf1_final, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "Run intermediate kernel");

    /// Read the result back
    HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0, sizeof(pbkdf1_out) * scalar_gws, host_crack, 0, NULL, NULL), "Copy result back");

    return count;
}
/*
 * Hash all *pcount candidates for the current 7-Zip salt: run the GPU
 * init kernel plus LOOP_COUNT passes of the loop kernel to derive the
 * AES keys, then attempt the actual decryption/check on the CPU (in
 * parallel with OpenMP when available).  Returns the candidate count.
 */
static int crypt_all(int *pcount, struct db_salt *salt)
{
	const int count = *pcount;
	int i, index;
	/* NULL lws lets the OpenCL runtime pick the local work size. */
	size_t *lws = local_work_size ? &local_work_size : NULL;

	/* Round GWS up to a multiple of the LWS when one is set. */
	global_work_size = local_work_size ? (count + local_work_size - 1) / local_work_size * local_work_size : count;

	/* Reset crack state left over from the previous invocation. */
	if (any_cracked) {
		memset(cracked, 0, cracked_size);
		any_cracked = 0;
	}

	// Copy data to gpu
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0,
		insize, inbuffer, 0, NULL, NULL),
	        "Copy data to gpu");

	// Run 1st kernel
	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], sevenzip_init, 1,
		NULL, &global_work_size, lws, 0, NULL, NULL),
		"Run init kernel");

	// Run loop kernel
	for (i = 0; i < LOOP_COUNT; i++) {
		HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id],
			crypt_kernel, 1, NULL, &global_work_size, lws, 0,
		        NULL, NULL),
		        "Run loop kernel");
		/* Finish each slice so long kernels don't trip the driver
		   watchdog, and service any pending events in between. */
		HANDLE_CLERROR(clFinish(queue[gpu_id]),
		               "Error running loop kernel");
		opencl_process_event();
	}

	// Read the result back (blocking read)
	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0,
		outsize, outbuffer, 0, NULL, NULL),
	        "Copy result back");

#ifdef _OPENMP
#pragma omp parallel for
#endif
	for (index = 0; index < count; index++) {
		/* decrypt and check; zero return apparently means the
		   derived key decrypted the data — see sevenzip_decrypt */
		if(sevenzip_decrypt(outbuffer[index].key, cur_salt->data) == 0)
		{
			cracked[index] = 1;
#ifdef _OPENMP
#pragma omp atomic
#endif
			any_cracked |= 1;
		}
	}
	return count;
}
/*
 * Hash all *pcount candidates on the GPU: upload keys (when changed)
 * and the current salt, run the init kernel, eight passes of the split
 * crypt kernel, then the finish kernel, and read the results back.
 * Returns the number of candidates processed.
 */
static int crypt_all(int *pcount, struct db_salt *salt)
{
	int count = *pcount;
	int pass;

	/* Round GWS up to a whole number of work-groups. */
	global_work_size = (count + local_work_size - 1) /
		local_work_size * local_work_size;

	/* Upload keys only when they changed; salt is uploaded always. */
	if (new_keys)
		HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id],
			mem_in, CL_FALSE, 0, insize, host_pass, 0, NULL,
			NULL), "Copy memin");

	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_salt,
		CL_FALSE, 0, saltsize, host_salt, 0, NULL, NULL),
		"Copy memsalt");

	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id],
		init_kernel, 1, NULL, &global_work_size, &local_work_size,
		0, NULL, NULL), "Set ND range");

	/* Eight slices of the split kernel; finishing each slice keeps
	   kernel run time bounded and lets pending events be serviced. */
	for (pass = 0; pass < 8; pass++) {
		HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id],
			crypt_kernel, 1, NULL, &global_work_size,
			&local_work_size, 0, NULL, NULL), "Set ND range");
		HANDLE_CLERROR(clFinish(queue[ocl_gpu_id]),
			"Error running loop kernel");
		opencl_process_event();
	}

	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id],
		finish_kernel, 1, NULL, &global_work_size, &local_work_size,
		0, NULL, NULL), "Set ND range");

	/* Non-blocking read; the clFinish below waits for completion. */
	HANDLE_CLERROR(clEnqueueReadBuffer(queue[ocl_gpu_id], mem_out,
		CL_FALSE, 0, outsize, host_hash, 0, NULL, NULL),
		"Copy data back");

	HANDLE_CLERROR(clFinish(queue[ocl_gpu_id]), "clFinish error");
	new_keys = 0;

	return count;
}
/*
 * Benchmark variant of crypt_all: same kernel sequence, but every
 * enqueue is tagged with a profiling event (multi_profilingEvent[0..7];
 * the loop slices use split_events[] indices 3, 4, 5) — presumably
 * consumed by the shared auto-tune timing code; confirm against the
 * format's init/tune path.  Uses BENCH_CLERROR so a failure aborts
 * only the benchmark attempt.  Returns the candidate count.
 */
static int crypt_all_benchmark(int *pcount, struct db_salt *salt)
{
	int count = *pcount;
	int i;

	/* Round GWS up to a whole number of work-groups. */
	global_work_size = (count + local_work_size - 1) / local_work_size * local_work_size;

	BENCH_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_in, CL_FALSE,
		0, insize, host_pass, 0, NULL, &multi_profilingEvent[0]), "Copy memin");
	BENCH_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], mem_salt, CL_FALSE,
		0, saltsize, host_salt, 0, NULL, &multi_profilingEvent[1]), "Copy memsalt");

	///Run the init kernel
	BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], init_kernel, 1,
		NULL, &global_work_size, &local_work_size,
		0, NULL, &multi_profilingEvent[2]), "Set ND range");

	///Run split kernel
	for(i = 0; i < 3; i++)
	{
		BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], crypt_kernel, 1,
			NULL, &global_work_size, &local_work_size,
			0, NULL, &multi_profilingEvent[split_events[i]]), "Set ND range");  //3, 4, 5
		BENCH_CLERROR(clFinish(queue[ocl_gpu_id]), "Error running loop kernel");
		opencl_process_event();
	}

	///Run the finish kernel
	BENCH_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], finish_kernel, 1,
		NULL, &global_work_size, &local_work_size,
		0, NULL, &multi_profilingEvent[6]), "Set ND range");

	/* Non-blocking read; the clFinish below waits for completion. */
	BENCH_CLERROR(clEnqueueReadBuffer(queue[ocl_gpu_id], mem_out, CL_FALSE, 0,
		outsize, host_hash, 0, NULL, &multi_profilingEvent[7]),
	    "Copy data back");

	///Await completion of all the above
	BENCH_CLERROR(clFinish(queue[ocl_gpu_id]), "clFinish error");

	return count;
}
/*
 * Derive keys for all *pcount candidates on the GPU.
 *
 * Uploads the candidate passwords, runs the setup kernel once, then
 * iterates the split kernel enough times to cover host_salt->rounds
 * iterations at HASH_LOOPS per call, and finally reads the derived
 * keys back with a blocking read.  Returns the candidate count.
 *
 * Fix: the last argument of three enqueue calls was the undeclared
 * identifier `NUUL` (a typo that cannot compile); corrected to NULL.
 */
static int crypt_all(int *pcount, struct db_salt *salt)
{
	int i;
	const int count = *pcount;
	/* Split-kernel invocations needed to cover all rounds. */
	int loops = (host_salt->rounds + HASH_LOOPS - 1) / HASH_LOOPS;

	opencl_limit_gws(count);

#if 0
	printf("crypt_all(%d)\n", count);
	printf("LWS = %d, GWS = %d, loops=%d\n",(int)local_work_size, (int)global_work_size, loops);
#endif

	/// Copy data to gpu (non-blocking; ordered by the in-order queue)
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], mem_in, CL_FALSE, 0,
		global_work_size * sizeof(pass_t), host_pass, 0, NULL,
		NULL), "Copy data to gpu");

	/// Run setup kernel
	HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1,
		NULL, &global_work_size, &local_work_size, 0, NULL,
		NULL), "Run kernel");

	/* Iterate the split kernel; clFinish bounds each slice's run time
	   and lets pending events be serviced between slices. */
	for(i = 0; i < loops; i++) {
		HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id],
		        split_kernel,
			1, NULL, &global_work_size, &local_work_size, 0, NULL,
			NULL), "Run split kernel");
		HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish");
		opencl_process_event();
	}
	/// Read the result back (blocking read)
	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_TRUE, 0,
		global_work_size * sizeof(crack_t), host_crack, 0, NULL,
		 NULL), "Copy result back");

	return count;
}
/*
 * Run the three-kernel PBKDF2 sequence (krnl[0] init, krnl[1] iterate,
 * krnl[2] finish) for `num` candidates on device jtrUniqDevNo via cmdq,
 * and return that device's buffer set.
 *
 * In PROFILE mode this doubles as the auto-tuner: an enqueue failure
 * halves the stored LWS and returns early; per-kernel execution time
 * (SUBMIT..END) is accumulated into _kernelExecTimeNs, and a new best
 * total doubles the LWS for the next probe and records exec_time_inv.
 * Outside PROFILE mode the result is read back asynchronously into
 * hash_out_api, with the completion event stored in events[event_ctr].
 *
 * NOTE(review): total iteration count is hard-wired to 10240, split in
 * chunks of ITERATION_COUNT_PER_CALL; the --itrCntKrnl on the final
 * chunk appears to make the total come out one short — confirm against
 * the kernel source.
 * NOTE(review): `evnt` is overwritten on every enqueue and never
 * clReleaseEvent'd — looks like an event leak; verify.
 */
static gpu_mem_buffer exec_pbkdf2(cl_uint *pass_api,cl_uint *salt_api,cl_uint saltlen_api,cl_uint *hash_out_api,cl_uint num, int jtrUniqDevNo,cl_command_queue cmdq )
{
	cl_event 	evnt;
	size_t 		N = num, M = globalObj[jtrUniqDevNo].lws;
	cl_int 		err;
	unsigned int 	i, itrCntKrnl = ITERATION_COUNT_PER_CALL;
	cl_ulong 	_kernelExecTimeNs = 0;

	/* Blocking uploads of passwords and salt. */
	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu, CL_TRUE, 0, 4 * num * sizeof(cl_uint), pass_api, 0, NULL, NULL ), "Copy data to gpu");
	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu, CL_TRUE, 0, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), salt_api, 0, NULL, NULL ), "Copy data to gpu");

	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 2, sizeof(cl_uint), &saltlen_api), "Set Kernel 0 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_uint), &num), "Set Kernel 0 Arg 3 :FAILED");

	/* Init kernel.  While profiling, an enqueue failure is treated as
	   "LWS too large": halve it and bail out of this probe. */
	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[0], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
	  	else
			HANDLE_CLERROR(err, "Enque Kernel Failed");

		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {

		cl_ulong 	startTime, endTime;

		HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
		HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

		/* SUBMIT..END: includes queueing latency, not just execution. */
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
		clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
		_kernelExecTimeNs = endTime - startTime;
	}

	/* Iterate kernel, chunked so a single enqueue stays short. */
	for (i=0; i< (10240 - 1); i = i+ itrCntKrnl ) {
		if (i == (10240 - itrCntKrnl))
			--itrCntKrnl;

		HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_uint), &itrCntKrnl), "Set Kernel 1 Arg 1 :FAILED");

		err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[1], 1, NULL, &N, &M, 0, NULL, &evnt);
		if (err) {
			if (PROFILE)
				globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
			else
				HANDLE_CLERROR(err, "Enque Kernel Failed");

			return globalObj[jtrUniqDevNo].gpu_buffer;
		}

		opencl_process_event();

		if (PROFILE) {
			cl_ulong 	startTime, endTime;

			HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync FAILED");
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

			_kernelExecTimeNs += endTime - startTime;
		}

		/* Single active device: drain the queue between chunks. */
		else if (active_dev_ctr == 1)
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

	}

	/* Finish kernel, same failure handling as above. */
	err = clEnqueueNDRangeKernel(cmdq, globalObj[jtrUniqDevNo].krnl[2], 1, NULL, &N, &M, 0, NULL, &evnt);
	if (err) {
		if (PROFILE)
			globalObj[jtrUniqDevNo].lws = globalObj[jtrUniqDevNo].lws / 2;
	  	else
			HANDLE_CLERROR(err, "Enque Kernel Failed");

		return globalObj[jtrUniqDevNo].gpu_buffer;
	}

	if (PROFILE) {
			cl_ulong 	startTime, endTime;
			HANDLE_CLERROR(clWaitForEvents(1, &evnt), "Sync :FAILED");
			HANDLE_CLERROR(clFinish(cmdq), "clFinish error");

			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL);
			clGetEventProfilingInfo(evnt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);

			_kernelExecTimeNs += endTime - startTime;

			/* New best time: remember it, double LWS for the next
			   probe, and record the inverse time (calls/sec scale)
			   used for multi-device load balancing — see callers. */
			if (_kernelExecTimeNs < kernelExecTimeNs) {
				kernelExecTimeNs = _kernelExecTimeNs;

				//printf("%d\n",(int)kernelExecTimeNs);

				globalObj[jtrUniqDevNo].lws  = globalObj[jtrUniqDevNo].lws * 2;
				globalObj[jtrUniqDevNo].exec_time_inv =  (long double)pow(10, 9) / (long double)kernelExecTimeNs;
			}

         }
         else
		/* Async read-back, gated on the finish kernel's event. */
		HANDLE_CLERROR(clEnqueueReadBuffer(cmdq, globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4*num*sizeof(cl_uint), hash_out_api, 1, &evnt, &events[event_ctr++]), "Write :FAILED");


	 return globalObj[jtrUniqDevNo].gpu_buffer;
}