/*
 * Auto-tune the OpenCL local work size (LWS) for device jtrUniqDevNo.
 *
 * Repeatedly launches the PBKDF2 kernel on dummy input while PROFILE is
 * set.  NOTE(review): the loop's exit test only works if exec_pbkdf2()
 * advances globalObj[jtrUniqDevNo].lws between iterations while
 * profiling is enabled — confirm against exec_pbkdf2()'s definition.
 * On return, globalObj[jtrUniqDevNo].lws holds the chosen LWS and
 * .exec_time_inv a relative speed figure (higher is better) later used
 * for multi-device work division.
 */
static void find_best_workgroup(int jtrUniqDevNo) {
        size_t 		 _lws=0;
	cl_device_type 	 dTyp;
	cl_command_queue cmdq;
	cl_int 		 err;
	/* Dummy input/output buffers: 4 cl_uints per key, capped at 65536 keys. */
	cl_uint 	 *dcc_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536));
	cl_uint 	 *dcc2_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536));
	cl_uint salt_api[9], length = 10;	/* dummy salt words and salt (username) length */

	event_ctr = 0;

	//HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info");
	dTyp = get_device_type(jtrUniqDevNo);
	/* Start the LWS search from a device-type-appropriate seed value. */
	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].lws = 1;
	else
		globalObj[jtrUniqDevNo].lws = 16;

	///Set Dummy DCC hash , unicode salt and ascii salt(username) length
	memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536));
	memset(salt_api, 0xfe, 9 * sizeof(cl_uint));

	/* Profiling queue so kernel execution time can be measured. */
	cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err);
	HANDLE_CLERROR(err, "Error creating command queue");

	PROFILE = 1;
	kernelExecTimeNs = CL_ULONG_MAX;

	///Find best local work size
	while (1) {
		_lws = globalObj[jtrUniqDevNo].lws;	/* remember LWS before this trial run */
		if (dTyp == CL_DEVICE_TYPE_CPU) {
			/* CPU: small batch; scale speed down to stay comparable with GPUs. */
			exec_pbkdf2(dcc_hash_host, salt_api, length, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq);
			globalObj[jtrUniqDevNo].exec_time_inv = globalObj[jtrUniqDevNo].exec_time_inv / 16;
		}
		else {
			/* GPU: full (capped) batch; normalize speed by batch/65536. */
			exec_pbkdf2(dcc_hash_host, salt_api, length, dcc2_hash_host, ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536), jtrUniqDevNo, cmdq);
			globalObj[jtrUniqDevNo].exec_time_inv *= ((MAX_KEYS_PER_CRYPT < 65536) ? MAX_KEYS_PER_CRYPT : 65536) / 65536;
		}

		/* Stop once the trial did not grow lws past its pre-call value. */
		if (globalObj[jtrUniqDevNo].lws <= _lws) break;
	}

	PROFILE = 0;

	if (options.verbosity > 2) {
		fprintf(stderr, "Optimal Work Group Size:%d\n", (int)globalObj[jtrUniqDevNo].lws);
		fprintf(stderr, "Kernel Execution Speed (Higher is better):%Lf\n", globalObj[jtrUniqDevNo].exec_time_inv);
	}

	MEM_FREE(dcc_hash_host);
	MEM_FREE(dcc2_hash_host);
	HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed");
}
/*
 * Quick benchmark of device jtrUniqDevNo: run one profiled PBKDF2 batch
 * on dummy input and map the measured speed (exec_time_inv, higher is
 * better) to a scaling factor for later work sizing.
 *
 * Returns 4 for slow devices (exec_time_inv < 15), 2 for medium (< 25),
 * 1 for fast ones.
 *
 * Bug fix: the original returned straight from the speed comparison,
 * making the MEM_FREE()/clReleaseCommandQueue() cleanup unreachable and
 * leaking three host buffers plus a command queue on every call.  The
 * result is now computed first and the cleanup always runs.
 */
static unsigned int quick_bechmark(int jtrUniqDevNo) {
	cl_device_type 	 dTyp;
	cl_command_queue cmdq;
	cl_int 		 err;
	unsigned int	 factor;
	/* Dummy buffers for a fixed 4096-key batch. */
	cl_uint 	 *dcc_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * 4096);
	cl_uint 	 *dcc2_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * 4096);
	cl_uint		*hmac_sha1_out
		       = (cl_uint*)mem_alloc(5 * sizeof(cl_uint) * 4096);
	cl_uint salt_api[9], length = 10;	/* dummy salt words and salt length */

	event_ctr = 0;

	dTyp = get_device_type(jtrUniqDevNo);
	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].lws = 1;
	else
		globalObj[jtrUniqDevNo].lws = 64;

	///Set Dummy DCC hash , unicode salt and ascii salt(username) length
	memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * 4096);
	memset(salt_api, 0xfe, 9 * sizeof(cl_uint));

	/* Profiling queue so exec_pbkdf2() can record kernel time. */
	cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err);
	HANDLE_CLERROR(err, "Error creating command queue");

	PROFILE = 1;
	kernelExecTimeNs = CL_ULONG_MAX;

	exec_pbkdf2(dcc_hash_host, salt_api, length, 2048, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq, hmac_sha1_out);

	PROFILE = 0;

	/* Map measured speed to a divisor; decide BEFORE cleanup so the
	 * buffers and queue are always released. */
	if (globalObj[jtrUniqDevNo].exec_time_inv < 15)
		factor = 4;
	else if (globalObj[jtrUniqDevNo].exec_time_inv < 25)
		factor = 2;
	else
		factor = 1;

	MEM_FREE(dcc_hash_host);
	MEM_FREE(dcc2_hash_host);
	MEM_FREE(hmac_sha1_out);
	HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed");

	return factor;
}
/*
 * Split a PBKDF2 batch of `num` keys across all active OpenCL devices,
 * proportionally to each device's benchmarked speed (exec_time_inv),
 * then wait for every device to finish.
 *
 * pass_api      packed passwords, 4 cl_uints per key (indexed as
 *               pass_api + 4*offset below)
 * salt_api      salt words shared by all keys
 * saltlen_api   salt length
 * hash_out_api  output hashes
 * num           number of keys; rounded up to a multiple of lws_max
 *
 * Batches of <= 8192 keys are sent whole to the first selected device.
 */
void pbkdf2_divide_work(cl_uint *pass_api, cl_uint *salt_api, cl_uint saltlen_api, cl_uint *hash_out_api, cl_uint num) {
	double 		total_exec_time_inv = 0;
	int 		i;
	unsigned int 	work_part, work_offset = 0, lws_max = max_lws();
	cl_int 		ret;

	event_ctr = 0;
	/* NOTE(review): clears num cl_uints, yet the output is addressed
	 * as 4 cl_uints per key (hash_out_api + 4*work_offset) — confirm
	 * whether 4*num was intended here. */
	memset(hash_out_api, 0, num * sizeof(cl_uint));

	/// Make num multiple of lws_max
	if (num % lws_max != 0)
		num = (num / lws_max + 1) * lws_max;

	///Divide work only if number of keys is greater than 8192, else use first device selected
	if (num > 8192) {
		///Calculates t0tal Kernel Execution Speed
		for (i = 0; i < active_dev_ctr; ++i)
			total_exec_time_inv += globalObj[ocl_device_list[i]].exec_time_inv;

		///Calculate work division ratio
		/* Normalizes exec_time_inv in place so the values sum to 1. */
		for (i = 0; i < active_dev_ctr; ++i)
			globalObj[ocl_device_list[i]].exec_time_inv /= total_exec_time_inv;

		///Divide memory and work
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				/* Last device takes the remainder, rounded up to lws_max. */
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				/* Proportional share of num, rounded up to lws_max. */
				work_part = num * globalObj[ocl_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}

			/* Guard: rounding can make the earlier shares exceed num,
			 * so the last remainder may wrap negative when viewed
			 * as signed — clamp to one workgroup. */
			if ((int)work_part <= 0)
				work_part = lws_max;

			///call to exec_pbkdf2()
#ifdef _DEBUG
			printf("Work Offset:%d  Work Part Size:%d %d\n",work_offset,work_part,event_ctr);
#endif
			exec_pbkdf2(pass_api + 4 * work_offset, salt_api, saltlen_api, hash_out_api + 4 * work_offset, work_part, ocl_device_list[i], queue[ocl_device_list[i]]);
			work_offset += work_part;
		}

		///Synchronize Device memory and Host memory
		for (i = active_dev_ctr - 1; i >= 0; --i)
			HANDLE_CLERROR(clFlush(queue[ocl_device_list[i]]), "Flush Error");

		/* Busy-wait until each recorded event reports completion. */
		for (i = 0; i < active_dev_ctr; ++i) {
			while (1) {
				HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
				if ((ret) == CL_COMPLETE)
					break;
#ifdef  _DEBUG
				 printf("%d%d ", ret, i);
#endif
			}
		}

		for (i = 0; i < active_dev_ctr; ++i)
			HANDLE_CLERROR(clFinish(queue[ocl_device_list[i]]), "Finish Error");

	 }
	 else {
		/* Small batch: no division, run everything on the first device. */
		exec_pbkdf2(pass_api, salt_api, saltlen_api, hash_out_api, num, ocl_device_list[0], queue[ocl_device_list[0]]);
		HANDLE_CLERROR(clFinish(queue[ocl_device_list[0]]), "Finish Error");
	}
}
/*
 * Variant of the work divider taking an explicit iteration count and an
 * HMAC-SHA1 state buffer (5 cl_uints per key): splits `num` keys across
 * all GPU devices in proportion to their benchmarked speed, waits for
 * the kernels, then reads the results back from each device.
 *
 * pass_api       packed passwords, 4 cl_uints per key
 * salt_api       shared salt words
 * saltlen_api    salt length
 * iter_cnt       PBKDF2 iteration count passed through to the kernel
 * hash_out_api   output hashes, 4 cl_uints per key
 * hmac_sha1_api  per-key HMAC-SHA1 state, 5 cl_uints per key
 * num            number of keys; rounded up to a multiple of lws_max
 *
 * Batches of <= 8192 keys go whole to the first GPU device.
 */
void pbkdf2_divide_work(cl_uint *pass_api, cl_uint *salt_api, cl_uint saltlen_api, unsigned int iter_cnt, cl_uint *hash_out_api, cl_uint *hmac_sha1_api, cl_uint num) {
	double 		total_exec_time_inv = 0;
	int 		i;
	unsigned int 	work_part, work_offset = 0, lws_max = max_lws();
	cl_int 		ret;

#ifdef  _DEBUG
	struct timeval startc, endc;
#endif

	event_ctr = 0;
	/* NOTE(review): clears num cl_uints, yet the output is addressed
	 * as 4 cl_uints per key (hash_out_api + 4*work_offset) — confirm
	 * whether 4*num was intended here. */
	memset(hash_out_api, 0, num * sizeof(cl_uint));

	/// Make num multiple of lws_max
	if (num % lws_max != 0)
		num = (num / lws_max + 1) * lws_max;

	///Divide work only if number of keys is greater than 8192, else use first device selected
	if (num > 8192) {
		///Calculates t0tal Kernel Execution Speed
		for (i = 0; i < active_dev_ctr; ++i)
			total_exec_time_inv += globalObj[gpu_device_list[i]].exec_time_inv;

		///Calculate work division ratio
		/* Normalizes exec_time_inv in place so the values sum to 1. */
		for (i = 0; i < active_dev_ctr; ++i)
			globalObj[gpu_device_list[i]].exec_time_inv /= total_exec_time_inv;

		///Divide memory and work
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				/* Last device takes the remainder, rounded up to lws_max. */
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				/* Proportional share of num, rounded up to lws_max. */
				work_part = num * globalObj[gpu_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}

			/* Guard against unsigned wrap of the remainder (see the
			 * rounding above) — clamp to one workgroup. */
			if ((int)work_part <= 0)
				work_part = lws_max;

#ifdef  _DEBUG
			gettimeofday(&startc, NULL) ;
			fprintf(stderr, "Work Offset:%d  Work Part Size:%d Event No:%d",work_offset,work_part,event_ctr);
#endif

			///call to exec_pbkdf2()
			exec_pbkdf2(pass_api + 4 * work_offset, salt_api, saltlen_api, iter_cnt, hash_out_api + 4 * work_offset, work_part, gpu_device_list[i], queue[gpu_device_list[i]], hmac_sha1_api + 5 * work_offset);
			work_offset += work_part;

#ifdef  _DEBUG
			gettimeofday(&endc, NULL);
			fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
		}

		///Synchronize all kernels
		for (i = active_dev_ctr - 1; i >= 0; --i)
			HANDLE_CLERROR(clFlush(queue[gpu_device_list[i]]), "Flush Error");

		/* Busy-wait until each recorded kernel event completes. */
		for (i = 0; i < active_dev_ctr; ++i) {
			while (1) {
				HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
				if ((ret) == CL_COMPLETE)
					break;
#ifdef  _DEBUG
				 printf("%d%d ", ret, i);
#endif
			}
		}

		event_ctr = work_part = work_offset = 0;

		///Read results back from all kernels
		/* Second pass recomputes the SAME partition as above (the
		 * normalized exec_time_inv values are unchanged), so each
		 * read targets the region its device produced. */
		for (i = 0; i < active_dev_ctr; ++i) {
			if (i == active_dev_ctr - 1) {
				work_part = num - work_offset;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}
			else {
				work_part = num * globalObj[gpu_device_list[i]].exec_time_inv;
				if (work_part % lws_max != 0)
					work_part = (work_part / lws_max + 1) * lws_max;
			}

			if ((int)work_part <= 0)
				work_part = lws_max;

#ifdef  _DEBUG
			gettimeofday(&startc, NULL) ;
			fprintf(stderr, "Work Offset:%d  Work Part Size:%d Event No:%d",work_offset,work_part,event_ctr);
#endif

			///Read results back from device
			/* Non-blocking read; completion is awaited by the
			 * clFinish loop below. */
			HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_device_list[i]],
							   globalObj[gpu_device_list[i]].gpu_buffer.hash_out_gpu,
							   CL_FALSE, 0,
							   4 * work_part * sizeof(cl_uint),
							   hash_out_api + 4 * work_offset,
							   0,
							   NULL,
							   &events[event_ctr++]), "Write :FAILED");
			work_offset += work_part;

#ifdef  _DEBUG
			gettimeofday(&endc, NULL);
			fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
		}

		for (i = 0; i < active_dev_ctr; ++i)
			HANDLE_CLERROR(clFinish(queue[gpu_device_list[i]]), "Finish Error");

	 }

	 else {
		/* Small batch: run everything on the first GPU device. */
		exec_pbkdf2(pass_api, salt_api, saltlen_api, iter_cnt, hash_out_api, num, gpu_device_list[0], queue[gpu_device_list[0]], hmac_sha1_api);
		HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_device_list[0]], globalObj[gpu_device_list[0]].gpu_buffer.hash_out_gpu, CL_FALSE, 0, 4*num*sizeof(cl_uint), hash_out_api, 0, NULL, NULL), "Write :FAILED");
		HANDLE_CLERROR(clFinish(queue[gpu_device_list[0]]), "Finish Error");
	}
}