Ejemplo n.º 1
0
static void init(struct fmt_main *self)
{
	cl_ulong maxsize;
	size_t selected_gws;

	opencl_init_opt("$JOHN/kernels/pwsafe_kernel.cl", ocl_gpu_id, NULL);

	init_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_INIT_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating init kernel");

	crypt_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_RUN_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating crypt kernel");

	finish_kernel = clCreateKernel(program[ocl_gpu_id], KERNEL_FINISH_NAME, &ret_code);
	HANDLE_CLERROR(ret_code, "Error while creating finish kernel");

	local_work_size = cpu(device_info[ocl_gpu_id]) ? 1 : 64;
	global_work_size = 0;
	opencl_get_user_preferences(CONFIG_NAME);

	//Initialize openCL tuning (library) for this format.
	opencl_init_auto_setup(STEP, ROUNDS_DEFAULT/8, 8, split_events,
		warn, &multi_profilingEvent[3], self, create_clobj, release_clobj,
		sizeof(pwsafe_pass), 0);

	self->methods.crypt_all = crypt_all_benchmark;

	selected_gws = global_work_size;
	/* Note: we ask for the kernels' max sizes, not the device's! */
	maxsize = get_current_work_group_size(ocl_gpu_id, init_kernel);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, crypt_kernel),
	              maxsize);
	maxsize = MIN(get_current_work_group_size(ocl_gpu_id, finish_kernel),
	              maxsize);

	while (local_work_size > maxsize)
		local_work_size >>= 1;

	self->params.max_keys_per_crypt = (global_work_size ? global_work_size: MAX_KEYS_PER_CRYPT);

	if (!local_work_size) {
		create_clobj(self->params.max_keys_per_crypt, self);
		find_best_lws(self, ocl_gpu_id);
		release_clobj();
	}
	global_work_size = selected_gws;

	if (global_work_size)
		create_clobj(global_work_size, self);
	else
		//user chose to die of boredom
		find_best_gws(self, ocl_gpu_id);

	self->params.min_keys_per_crypt = local_work_size;
	self->params.max_keys_per_crypt = global_work_size;
	self->methods.crypt_all = crypt_all;

	if (options.verbosity > 2)
		fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
}
Ejemplo n.º 2
0
static void fmt_ssha_init(struct fmt_main *self)
{
    char *temp;
    cl_ulong maxsize;

    global_work_size = 0;

    opencl_init("$JOHN/ssha_kernel.cl", ocl_gpu_id, platform_id);

    // create kernel to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "sha1_crypt_kernel", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(temp);

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(temp);

    if ((temp = getenv("LWS")))
        local_work_size = atoi(temp);

    if ((temp = getenv("GWS")))
        global_work_size = atoi(temp);

    if (!local_work_size) {
        int temp = global_work_size;
        local_work_size = maxsize;
        global_work_size = global_work_size ? global_work_size : 4 * maxsize;
        create_clobj(global_work_size, self);
        opencl_find_best_workgroup_limit(self, maxsize);
        release_clobj();
        global_work_size = temp;
    }

    if (local_work_size > maxsize) {
        fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
        local_work_size = maxsize;
    }

    if (!global_work_size)
        find_best_gws(getenv("GWS") == NULL ? 0 : 1, self);

    if (global_work_size < local_work_size)
        global_work_size = local_work_size;

    fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
    create_clobj(global_work_size, self);
    atexit(release_clobj);
}
Ejemplo n.º 3
0
/* ------- Initialization  ------- */
static void init(struct fmt_main * self) {
    char * tmp_value;
    char * task = "$JOHN/sha256_kernel.cl";

    opencl_init_dev(ocl_gpu_id, platform_id);
    source_in_use = device_info[ocl_gpu_id];

    if ((tmp_value = getenv("_TYPE")))
        source_in_use = atoi(tmp_value);

    opencl_build_kernel(task, ocl_gpu_id);

    // create kernel(s) to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    cmp_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_cmp", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel_cmp. Double-check kernel name?");

    global_work_size = get_task_max_size();
    local_work_size = 0;

    if (source_in_use != device_info[ocl_gpu_id]) {
        device_info[ocl_gpu_id] = source_in_use;
        fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("LWS")))
        local_work_size = atoi(tmp_value);

    //Check if local_work_size is a valid number.
    if (local_work_size > get_task_max_work_group_size()){
        fprintf(stderr, "Error: invalid local work size (LWS). Max value allowed is: %zd\n" ,
               get_task_max_work_group_size());
        local_work_size = 0; //Force find a valid number.
    }
    self->params.max_keys_per_crypt = global_work_size;

    if (!local_work_size) {
        local_work_size = get_task_max_work_group_size();
        create_clobj(global_work_size, self);
        find_best_workgroup(self);
        release_clobj();
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("GWS")))
        global_work_size = atoi(tmp_value);

    //Check if a valid multiple is used.
    global_work_size = get_multiple(global_work_size, local_work_size);

    if (global_work_size)
        create_clobj(global_work_size, self);

    else {
        //user chose to die of boredom
        global_work_size = get_task_max_size();
        find_best_gws(self);
    }
    fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n",
           (int) local_work_size, global_work_size);
    self->params.max_keys_per_crypt = global_work_size;
}
Ejemplo n.º 4
0
static void init(struct fmt_main *self)
{
#ifdef CL_VERSION_1_0
	char *temp;
	cl_ulong maxsize;

	global_work_size = 0;

	opencl_init("$JOHN/rar_kernel.cl", ocl_gpu_id, platform_id);

	// create kernel to execute
	crypt_kernel = clCreateKernel(program[ocl_gpu_id], "SetCryptKeys", &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

	/* We mimic the lengths of cRARk for comparisons */
	if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_GPU) {
#ifndef DEBUG
		self->params.benchmark_comment = " (6 characters)";
#endif
		self->params.tests = gpu_tests;
#if defined(DEBUG) && !defined(ALWAYS_OPENCL)
		fprintf(stderr, "Note: will use CPU for some self-tests, and Single mode.\n");
#endif
	}

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
		local_work_size = atoi(temp);

	if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
		global_work_size = atoi(temp);

	if ((temp = getenv("LWS")))
		local_work_size = atoi(temp);

	if ((temp = getenv("GWS")))
		global_work_size = atoi(temp);

	/* Note: we ask for this kernel's max size, not the device's! */
	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");

#ifdef DEBUG
	fprintf(stderr, "Max allowed local work size %d\n", (int)maxsize);
#endif

	if (!local_work_size) {
		if (get_device_type(ocl_gpu_id) == CL_DEVICE_TYPE_CPU) {
			if (get_platform_vendor_id(platform_id) == INTEL)
				local_work_size = 8;
			else
				local_work_size = 1;
		} else {
			local_work_size = 64;
		}
	}

	if (local_work_size > maxsize) {
		fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
		local_work_size = maxsize;
	}

	if (!global_work_size)
		find_best_gws(temp == NULL ? 0 : 1);

	if (global_work_size < local_work_size)
		global_work_size = local_work_size;

	fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);

	create_clobj(global_work_size);

#ifdef DEBUG
	{
		cl_ulong loc_mem_size;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(loc_mem_size), &loc_mem_size, NULL), "Query local memory usage");
		fprintf(stderr, "Kernel using %lu bytes of local memory out of %lu available\n", loc_mem_size, get_local_memory_size(ocl_gpu_id));
	}
#endif

	atexit(release_clobj);

	*mkpc = VF * global_work_size;

#endif	/* OpenCL */

#if defined (_OPENMP)
	omp_t = omp_get_max_threads();
	self->params.min_keys_per_crypt *= omp_t;
#ifndef CL_VERSION_1_0	/* OpenCL gets to decide */
	*mkpc = omp_t * OMP_SCALE * MAX_KEYS_PER_CRYPT;
#endif
	init_locks();
#endif /* _OPENMP */

	if (options.utf8)
		self->params.plaintext_length = PLAINTEXT_LENGTH * 3;

	unpack_data = mem_calloc_tiny(sizeof(unpack_data_t) * omp_t, MEM_ALIGN_WORD);
	cracked = mem_calloc_tiny(sizeof(*cracked) * *mkpc, MEM_ALIGN_WORD);
#ifndef CL_VERSION_1_0
	saved_key = mem_calloc_tiny(UNICODE_LENGTH * *mkpc, MEM_ALIGN_NONE);
	saved_len = mem_calloc_tiny(sizeof(*saved_len) * *mkpc, MEM_ALIGN_WORD);
	saved_salt = mem_calloc_tiny(8, MEM_ALIGN_NONE);
	aes_key = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
	aes_iv = mem_calloc_tiny(16 * *mkpc, MEM_ALIGN_NONE);
#endif

	/* OpenSSL init */
	init_aesni();
	SSL_load_error_strings();
	SSL_library_init();
	OpenSSL_add_all_algorithms();
#ifndef __APPLE__
	atexit(openssl_cleanup);
#endif
	/* CRC-32 table init, do it before we start multithreading */
	{
		CRC32_t crc;
		CRC32_Init(&crc);
	}
}
Ejemplo n.º 5
0
size_t 	select_device(int jtrUniqDevNo, struct fmt_main *fmt) {
	cl_int 		err;
	const char  	*errMsg;

	opencl_init_opt("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL);

	globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n");
		return 0;
	}

	errMsg = "Create Buffer FAILED";

	globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0)
		HANDLE_CLERROR(err,errMsg );
	globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, MAX_KEYS_PER_CRYPT * sizeof(temp_buf), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);

	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 4, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 4 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 2 Arg 1 :FAILED");

	if (((!global_work_size) || ((!local_work_size) && global_work_size)) || (active_dev_ctr != 0))
		find_best_workgroup(jtrUniqDevNo);
	else {
		size_t 		maxsize, maxsize2;

		maxsize = get_kernel_preferred_work_group_size(jtrUniqDevNo, globalObj[jtrUniqDevNo].krnl[0]);
		maxsize2 = get_kernel_preferred_work_group_size(jtrUniqDevNo, globalObj[jtrUniqDevNo].krnl[1]);
		if (maxsize2 > maxsize)
			maxsize = maxsize2;

		maxsize2 = get_kernel_preferred_work_group_size(jtrUniqDevNo, globalObj[jtrUniqDevNo].krnl[2]);
		if (maxsize2 > maxsize)
			maxsize = maxsize2;

		while (local_work_size > maxsize)
			local_work_size /= 2;

	if (options.verbosity > 3)
		fprintf(stderr, "Local worksize (LWS) forced to %zu\n", local_work_size);

		globalObj[jtrUniqDevNo].lws = local_work_size;
	}

	if ((!global_work_size) || (active_dev_ctr != 0))
		find_best_gws(jtrUniqDevNo, fmt);
	else {
		if (options.verbosity > 3)
			fprintf(stderr, "Global worksize (GWS) forced to %zu\n", global_work_size);

		fmt -> params.max_keys_per_crypt = global_work_size;
		fmt -> params.min_keys_per_crypt = max_lws();
	}

	active_dev_ctr++;

	return globalObj[jtrUniqDevNo].lws;
}
Ejemplo n.º 6
0
static void init(struct fmt_main *self)
{
    char *temp;
    cl_ulong maxsize, maxsize2;
    char build_opts[64];

    global_work_size = 0;

    snprintf(build_opts, sizeof(build_opts),
             "-DHASH_LOOPS=%u -DUNICODE_LENGTH=%u %s",
             HASH_LOOPS,
             UNICODE_LENGTH,
             (options.flags & FLG_VECTORIZE) ? "-DVECTORIZE" :
             (options.flags & FLG_SCALAR) ? "-DSCALAR" : "");
    opencl_init_opt("$JOHN/office2007_kernel.cl", ocl_gpu_id, platform_id, build_opts);

    // create kernel to execute
    GenerateSHA1pwhash = clCreateKernel(program[ocl_gpu_id], "GenerateSHA1pwhash", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "HashLoop", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");
    Generate2007key = clCreateKernel(program[ocl_gpu_id], "Generate2007key", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

    if (options.flags & FLG_VECTORIZE) {
        /* Run vectorized code */
        VF = 4;
        self->params.algorithm_name = "OpenCL 4x";
    }

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(temp);

    if ((temp = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(temp);

    if ((temp = getenv("LWS")))
        local_work_size = atoi(temp);

    if ((temp = getenv("GWS")))
        global_work_size = atoi(temp);

    /* Note: we ask for the kernels' max sizes, not the device's! */
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(GenerateSHA1pwhash, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Query max work group size");
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size");
    if (maxsize2 < maxsize) maxsize = maxsize2;
    HANDLE_CLERROR(clGetKernelWorkGroupInfo(Generate2007key, devices[ocl_gpu_id], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Query max work group size");
    if (maxsize2 < maxsize) maxsize = maxsize2;

#if 0
    /* Our use of local memory sets a limit for LWS */
    maxsize2 = get_local_memory_size(ocl_gpu_id) / (24 * VF);
    while (maxsize > maxsize2)
        maxsize >>= 1;
#endif

    /* maxsize is the lowest figure from the three different kernels */
    if (!local_work_size) {
        if (getenv("LWS")) {
            /* LWS was explicitly set to 0 */
            int temp = global_work_size;
            local_work_size = maxsize;
            global_work_size = global_work_size ? global_work_size : 4 * maxsize;
            create_clobj(global_work_size, self);
            opencl_find_best_workgroup_limit(self, maxsize);
            release_clobj();
            global_work_size = temp;
        } else {
            if (cpu(device_info[ocl_gpu_id])) {
                if (get_platform_vendor_id(platform_id) == DEV_INTEL)
                    local_work_size = MIN(maxsize, 8);
                else
                    local_work_size = 1;
            } else
                local_work_size = MIN(maxsize, 64);
        }
    }

    if (local_work_size > maxsize) {
        fprintf(stderr, "LWS %d is too large for this GPU. Max allowed is %d, using that.\n", (int)local_work_size, (int)maxsize);
        local_work_size = maxsize;
    }

    if (!global_work_size)
        find_best_gws(getenv("GWS") == NULL ? 0 : 1, self);

    if (global_work_size < local_work_size)
        global_work_size = local_work_size;

    fprintf(stderr, "Local worksize (LWS) %d, Global worksize (GWS) %d\n", (int)local_work_size, (int)global_work_size);
    create_clobj(global_work_size, self);
    atexit(release_clobj);

    if (options.utf8)
        self->params.plaintext_length = MIN(125, 3 * PLAINTEXT_LENGTH);
}
Ejemplo n.º 7
0
/* ------- Initialization  ------- */
static void init(struct fmt_main * self) {
    char * tmp_value;
    char * task = "$JOHN/cryptsha512_kernel_DEFAULT.cl";
    uint64_t startTime, runtime;

    opencl_init_dev(ocl_gpu_id, platform_id);
    startTime = (unsigned long) time(NULL);
    source_in_use = device_info[ocl_gpu_id];

    if ((tmp_value = getenv("_TYPE")))
        source_in_use = atoi(tmp_value);

    if ((tmp_value = getenv("_FAST")))
        fast_mode = TRUE;

    if (use_local(source_in_use))
            task = "$JOHN/cryptsha512_kernel_LOCAL.cl";
    else if (gpu(source_in_use)) {
        fprintf(stderr, "Building the kernel, this could take a while\n");
        task = "$JOHN/cryptsha512_kernel_GPU.cl";
    }
    fflush(stdout);
    opencl_build_kernel(task, ocl_gpu_id);

    if ((runtime = (unsigned long) (time(NULL) - startTime)) > 2UL)
        fprintf(stderr, "Elapsed time: %lu seconds\n", runtime);
    fflush(stdout);

    // create kernel(s) to execute
    crypt_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_crypt", &ret_code);
    HANDLE_CLERROR(ret_code, "Error creating kernel. Double-check kernel name?");

    if (gpu(source_in_use) || use_local(source_in_use)) {
        prepare_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_prepare", &ret_code);
        HANDLE_CLERROR(ret_code, "Error creating kernel_prepare. Double-check kernel name?");
        final_kernel = clCreateKernel(program[ocl_gpu_id], "kernel_final", &ret_code);
        HANDLE_CLERROR(ret_code, "Error creating kernel_final. Double-check kernel name?");
    }
    global_work_size = get_task_max_size();
    local_work_size = get_default_workgroup();

    if (source_in_use != device_info[ocl_gpu_id])
        fprintf(stderr, "Selected runtime id %d, source (%s)\n", source_in_use, task);

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, LWS_CONFIG)))
        local_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("LWS")))
        local_work_size = atoi(tmp_value);

    //Check if local_work_size is a valid number.
    if (local_work_size > get_task_max_work_group_size()){
        local_work_size = 0; //Force find a valid number.
    }
    self->params.max_keys_per_crypt = global_work_size;

    if (!local_work_size) {
        local_work_size = get_task_max_work_group_size();
        create_clobj(global_work_size, self);
        find_best_workgroup(self);
        release_clobj();
    }

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS,
                                   SUBSECTION_OPENCL, GWS_CONFIG)))
        global_work_size = atoi(tmp_value);

    if ((tmp_value = getenv("GWS")))
        global_work_size = atoi(tmp_value);

    //Check if a valid multiple is used.
    global_work_size = get_multiple(global_work_size, local_work_size);

    if (global_work_size)
        create_clobj(global_work_size, self);

    else {
        //user chose to die of boredom
        global_work_size = get_task_max_size();
        find_best_gws(self);
    }
    fprintf(stderr, "Local work size (LWS) %d, global work size (GWS) %zd\n",
           (int) local_work_size, global_work_size);
    self->params.max_keys_per_crypt = global_work_size;
}
size_t 	select_device(int jtrUniqDevNo, struct fmt_main *fmt) {
	cl_int 		err;
	const char  	*errMsg;
	size_t	 	memAllocSz;

	active_dev_ctr++;

	opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL);

	globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_short", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_short FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_long", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_long FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[3] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n");
		return 0;
	}

	errMsg = "Create Buffer FAILED";

	memAllocSz = 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0)
		HANDLE_CLERROR(err,errMsg );
	globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = MAX_KEYS_PER_CRYPT * sizeof(temp_buf);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = 5 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);


	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 3 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 1 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 2, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu), "Set Kernel 1 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 3 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 3 Arg 1 :FAILED");

	if (!local_work_size)
		find_best_workgroup(jtrUniqDevNo, quick_bechmark(jtrUniqDevNo));

	else {
		size_t 		maxsize, maxsize2;

		globalObj[jtrUniqDevNo].lws = local_work_size;

		// Obey limits
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[0], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Error querying max LWS");
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[1], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[2], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[3], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;

		while (globalObj[jtrUniqDevNo].lws > maxsize)
			globalObj[jtrUniqDevNo].lws /= 2;

		if (options.verbosity > 3)
			fprintf(stderr, "Local worksize (LWS) forced to "Zu"\n", globalObj[jtrUniqDevNo].lws);

		globalObj[jtrUniqDevNo].exec_time_inv = 1;
	}

	if (!global_work_size)
		find_best_gws(jtrUniqDevNo, fmt);

	else {
		if (options.verbosity > 3)
			fprintf(stderr, "Global worksize (GWS) forced to "Zu"\n", global_work_size);

		fmt -> params.max_keys_per_crypt = global_work_size;
		fmt -> params.min_keys_per_crypt = max_lws();
	}

	return globalObj[jtrUniqDevNo].lws;
}