static void find_best_gws(int jtrUniqDevNo, struct fmt_main *fmt) {
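	/*
	 * Pick a global work size (GWS) proportional to this device's measured
	 * kernel speed (exec_time_inv), using 163840 keys per speed unit as the
	 * scale factor. The result is kept a multiple of 8192 and bounded by
	 * both the device's maximum buffer allocation and MAX_KEYS_PER_CRYPT.
	 */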

	long int 		gds_size, device_gds_size;
	static long double 	total_exec_time_inv;

	device_gds_size = (long int)globalObj[jtrUniqDevNo].exec_time_inv * 163840;
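	/* If this device's speed-scaled share would not fit in one temp_buf
	   allocation, clamp it and add it on top of the running total; otherwise
	   grow the running total first and round up to the next 8192 boundary.
	   total_exec_time_inv is static, so it accumulates across devices. */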
	if (device_gds_size * sizeof(temp_buf) > get_max_mem_alloc_size(jtrUniqDevNo)) {
		device_gds_size = ((get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf)) / 8192) * 8192;
		gds_size = (long int)(total_exec_time_inv * 163840) + device_gds_size;
		gds_size = (gds_size / 8192 - 1 ) * 8192;
		total_exec_time_inv +=  globalObj[jtrUniqDevNo].exec_time_inv;
	}

	else {
		total_exec_time_inv +=  globalObj[jtrUniqDevNo].exec_time_inv;
		gds_size = (long int)(total_exec_time_inv * 163840);
		gds_size = (gds_size / 8192 + 1 ) * 8192;
	}

	gds_size = (gds_size < (MAX_KEYS_PER_CRYPT - 8192)) ? gds_size : (MAX_KEYS_PER_CRYPT - 8192);
	gds_size = (gds_size > 8192) ? gds_size : 8192;

	if (options.verbosity > 2)
		fprintf(stderr, "Optimal Global Work Size:%ld\n", gds_size);

	fmt->params.max_keys_per_crypt = gds_size;
	fmt->params.min_keys_per_crypt = max_lws();
}
Example #2
static void find_best_gws(int do_benchmark, struct fmt_main *self)
{
    int num;
    cl_ulong run_time, min_time = CL_ULONG_MAX;
    unsigned int SHAspeed, bestSHAspeed = 0, max_gws;
    int optimal_gws = local_work_size;
    const int sha1perkey = 50004;
    unsigned long long int MaxRunTime = 5000000000ULL;

    max_gws = get_max_mem_alloc_size(ocl_gpu_id) / (UNICODE_LENGTH * VF);
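    /* max_gws is the largest GWS whose password buffer (UNICODE_LENGTH * VF
       bytes per key) still fits in a single device allocation. */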

    if (do_benchmark) {
        fprintf(stderr, "Calculating best keys per crypt (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, MaxRunTime / 1000000000UL);
        fprintf(stderr, "Raw GPU speed figures including buffer transfers:\n");
    }
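    /* Double the GWS each round; the search stops once the test kernel fails
       or a run exceeds the MaxRunTime budget. */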

    for (num = local_work_size; num <= max_gws; num *= 2) {
        if (!do_benchmark)
            advance_cursor();
        if (!(run_time = gws_test(num, do_benchmark, self)))
            break;

        SHAspeed = sha1perkey * (1000000000UL * VF * num / run_time);

        if (run_time < min_time)
            min_time = run_time;

        if (do_benchmark)
            fprintf(stderr, "gws %6d%8llu c/s%14u sha1/s%8.3f sec per crypt_all()", num, (1000000000ULL * VF * num / run_time), SHAspeed, (float)run_time / 1000000000.);

        if (((float)run_time / (float)min_time) < ((float)SHAspeed / (float)bestSHAspeed)) {
            if (do_benchmark)
                fprintf(stderr, "!\n");
            bestSHAspeed = SHAspeed;
            optimal_gws = num;
        } else {
            if (run_time < MaxRunTime && SHAspeed > (bestSHAspeed * 1.01)) {
                if (do_benchmark)
                    fprintf(stderr, "+\n");
                bestSHAspeed = SHAspeed;
                optimal_gws = num;
                continue;
            }
            if (do_benchmark)
                fprintf(stderr, "\n");
            if (run_time >= MaxRunTime)
                break;
        }
    }
    global_work_size = optimal_gws;
}
Example #3
/* --
  This function can be used to calculate the best number
  of keys per crypt (GWS) for the given format
-- */
static void find_best_gws(struct fmt_main * self) {
    size_t num = 0;
    cl_ulong run_time, min_time = CL_ULONG_MAX;

    int optimal_gws = local_work_size, step = STEP;
    int do_benchmark = 0;
    unsigned int SHAspeed, bestSHAspeed = 0;
    unsigned long long int max_run_time = 1000000000ULL;
    char *tmp_value;

    if ((tmp_value = getenv("STEP"))){
        step = atoi(tmp_value);
        do_benchmark = 1;
    }
    step = get_multiple(step, local_work_size);

    if ((tmp_value = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL, DUR_CONFIG)))
        max_run_time = atoi(tmp_value) * 1000000000UL;

    fprintf(stderr, "Calculating best global work size (GWS) for LWS=%zd and max. %llu s duration.\n\n",
            local_work_size, max_run_time / 1000000000ULL);

    if (do_benchmark)
        fprintf(stderr, "Raw speed figures including buffer transfers:\n");

    for (num = get_step(num, step, 1); num; num = get_step(num, step, 0)) {
        // Check if the hardware can handle the size we are going to try now.
        if (sizeof(sha256_password) * num * 1.2 > get_max_mem_alloc_size(ocl_gpu_id))
            break;

        if (!(run_time = gws_test(num, self)))
            continue;

        if (!do_benchmark)
            advance_cursor();

        SHAspeed = num / (run_time / 1000000000.);

        if (run_time < min_time)
            min_time = run_time;

        if (do_benchmark) {
            fprintf(stderr, "gws: %8zu\t%12lu c/s %8.3f ms per crypt_all()",
                    num, (long) (num / (run_time / 1000000000.)),
                    (float) run_time / 1000000.);

            if (run_time > max_run_time) {
                fprintf(stderr, " - too slow\n");
                break;
            }
        } else {
            if (run_time > min_time * 20 || run_time > max_run_time)
                break;
        }
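        /* Only adopt a GWS that improves throughput by more than 10000 keys
           per second over the best seen so far. */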
        if (((long) SHAspeed - bestSHAspeed) > 10000) {
            if (do_benchmark)
                fprintf(stderr, "+");
            bestSHAspeed = SHAspeed;
            optimal_gws = num;
        }
        if (do_benchmark)
            fprintf(stderr, "\n");
    }
    fprintf(stderr, "Optimal global work size %d\n", optimal_gws);
    fprintf(stderr, "(to avoid this test on next run, put \""
        GWS_CONFIG " = %d\" in john.conf, section [" SECTION_OPTIONS
        SUBSECTION_OPENCL "])\n", optimal_gws);
    global_work_size = optimal_gws;
    create_clobj(optimal_gws, self);
}
static void find_best_workgroup(int jtrUniqDevNo, unsigned int gpu_perf) {
	size_t		 _lws = 0;
	cl_device_type 	 dTyp;
	cl_command_queue cmdq;
	cl_int 		 err;
	unsigned int 	 max_kpc
		       = get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf) < MAX_KEYS_PER_CRYPT ?
			 ((get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf)) / 8192 - 1) * 8192 :
			 MAX_KEYS_PER_CRYPT;
	cl_uint 	 *dcc_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint 	 *dcc2_hash_host
		       = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint		*hmac_sha1_out
		       = (cl_uint*)mem_alloc(5 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint salt_api[9], length = 10;

	event_ctr = 0;

	//HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info");
	dTyp = get_device_type(jtrUniqDevNo);
	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].lws = 1;
	else
		globalObj[jtrUniqDevNo].lws = 16;

	/// Set dummy DCC hash, Unicode salt and ASCII salt (username) length
	memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	memset(salt_api, 0xfe, 9 * sizeof(cl_uint));

	cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err);
	HANDLE_CLERROR(err, "Error creating command queue");

	PROFILE = 1;
	kernelExecTimeNs = CL_ULONG_MAX;

	///Find best local work size
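	/* exec_pbkdf2() is expected to raise globalObj[].lws while PROFILE is
	   set whenever a larger work-group proves faster; the loop stops as
	   soon as the LWS no longer grows. */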
	while (1) {
		_lws = globalObj[jtrUniqDevNo].lws;
		if (dTyp == CL_DEVICE_TYPE_CPU)
			exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq, hmac_sha1_out);
		else
			exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, (((max_kpc < 65536) ? max_kpc : 65536) / gpu_perf), jtrUniqDevNo, cmdq, hmac_sha1_out);

		if (globalObj[jtrUniqDevNo].lws <= _lws)
			break;
	}
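	/* Scale exec_time_inv to a common baseline (the GPU path only ran
	   max_kpc / gpu_perf keys out of 65536), presumably so find_best_gws()
	   can compare devices fairly. */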

	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].exec_time_inv = globalObj[jtrUniqDevNo].exec_time_inv / 16;
	else
		globalObj[jtrUniqDevNo].exec_time_inv *= (((max_kpc < 65536) ? max_kpc : 65536) / (long double) gpu_perf) / 65536;

	PROFILE = 0;

	if (options.verbosity > 2) {
		fprintf(stderr, "Optimal Work Group Size:%d\n", (int)globalObj[jtrUniqDevNo].lws);
		fprintf(stderr, "Kernel Execution Speed (Higher is better):%Lf\n", globalObj[jtrUniqDevNo].exec_time_inv);
	}

	MEM_FREE(dcc_hash_host);
	MEM_FREE(dcc2_hash_host);
	MEM_FREE(hmac_sha1_out);
	HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed");
}
size_t select_device(int jtrUniqDevNo, struct fmt_main *fmt) {
	cl_int 		err;
	const char  	*errMsg;
	size_t	 	memAllocSz;

	active_dev_ctr++;

	opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevNo, NULL);

	globalObj[jtrUniqDevNo].krnl[0] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_short", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_short FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[1] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_preprocess_long", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_preprocess_long FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[2] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_iter", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_iter FAILED\n");
		return 0;
	}
	globalObj[jtrUniqDevNo].krnl[3] = clCreateKernel(program[jtrUniqDevNo], "pbkdf2_postprocess", &err);
	if (err) {
		fprintf(stderr, "Create Kernel pbkdf2_postprocess FAILED\n");
		return 0;
	}

	errMsg = "Create Buffer FAILED";
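	/* Create the device buffers, capping each request at the device's
	   maximum single allocation size. */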

	memAllocSz = 4 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu == (cl_mem)0)
		HANDLE_CLERROR(err,errMsg );
	globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_ONLY, (MAX_SALT_LENGTH / 2 + 1) * sizeof(cl_uint), NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_WRITE_ONLY, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = MAX_KEYS_PER_CRYPT * sizeof(temp_buf);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);
	memAllocSz = 5 * MAX_KEYS_PER_CRYPT * sizeof(cl_uint);
	memAllocSz = memAllocSz < get_max_mem_alloc_size(jtrUniqDevNo) ? memAllocSz : get_max_mem_alloc_size(jtrUniqDevNo) / 4 * 4;
	globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu = clCreateBuffer(context[jtrUniqDevNo], CL_MEM_READ_WRITE, memAllocSz, NULL, &err);
	if (globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu == (cl_mem)0)
		HANDLE_CLERROR(err, errMsg);


	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 0 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.salt_gpu), "Set Kernel 0 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[0], 3, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 0 Arg 3 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.pass_gpu), "Set Kernel 1 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 1 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[1], 2, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hmac_sha1_gpu), "Set Kernel 1 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[2], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 2 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 0, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.temp_buf_gpu), "Set Kernel 3 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(globalObj[jtrUniqDevNo].krnl[3], 1, sizeof(cl_mem), &globalObj[jtrUniqDevNo].gpu_buffer.hash_out_gpu), "Set Kernel 3 Arg 1 :FAILED");
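	/* Local work size: auto-tune when none was forced, otherwise clamp the
	   forced value to each kernel's CL_KERNEL_WORK_GROUP_SIZE limit. */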

	if (!local_work_size)
		find_best_workgroup(jtrUniqDevNo, quick_bechmark(jtrUniqDevNo));

	else {
		size_t 		maxsize, maxsize2;

		globalObj[jtrUniqDevNo].lws = local_work_size;

		// Obey limits
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[0], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize), &maxsize, NULL), "Error querying max LWS");
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[1], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[2], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;
		HANDLE_CLERROR(clGetKernelWorkGroupInfo(globalObj[jtrUniqDevNo].krnl[3], devices[jtrUniqDevNo], CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxsize2), &maxsize2, NULL), "Error querying max LWS");
		if (maxsize2 > maxsize)
			maxsize = maxsize2;

		while (globalObj[jtrUniqDevNo].lws > maxsize)
			globalObj[jtrUniqDevNo].lws /= 2;

		if (options.verbosity > 3)
			fprintf(stderr, "Local worksize (LWS) forced to "Zu"\n", globalObj[jtrUniqDevNo].lws);

		globalObj[jtrUniqDevNo].exec_time_inv = 1;
	}
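	/* Global work size: auto-tune via find_best_gws() unless a value was
	   forced, in which case it becomes max_keys_per_crypt directly. */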

	if (!global_work_size)
		find_best_gws(jtrUniqDevNo, fmt);

	else {
		if (options.verbosity > 3)
			fprintf(stderr, "Global worksize (GWS) forced to "Zu"\n", global_work_size);

		fmt->params.max_keys_per_crypt = global_work_size;
		fmt->params.min_keys_per_crypt = max_lws();
	}

	return globalObj[jtrUniqDevNo].lws;
}