Пример #1
0
/*
 * Allocate and populate a Benchmark for the given OpenCL descriptor.
 *
 * Loads "nlm.cl", creates one "nlm" kernel and one input/output buffer pair
 * per device, and allocates the host-side image arrays. The input images are
 * filled with values from g_random_double().
 *
 * Returns the new Benchmark, or NULL when the program could not be obtained.
 * Note that in the failure case `ocl` is released here via ocl_free().
 */
static Benchmark *
setup_benchmark(opencl_desc *ocl, Settings *settings)
{
    Benchmark *bench;
    cl_program nlm_program;
    cl_int errcode = CL_SUCCESS;

    nlm_program = ocl_get_program(ocl, "nlm.cl", "");

    if (nlm_program == NULL) {
        g_warning ("Could not open nlm.cl");
        ocl_free (ocl);
        return NULL;
    }

    bench = g_malloc0(sizeof(Benchmark));
    bench->ocl = ocl;
    bench->settings = settings;

    /* One kernel object per device */
    bench->kernels = g_malloc0(ocl->num_devices * sizeof(cl_kernel));

    for (int dev = 0; dev < ocl->num_devices; dev++) {
        bench->kernels[dev] = clCreateKernel(nlm_program, "nlm", &errcode);
        CHECK_ERROR(errcode);
    }

    /* A negative image count means "pick a default": 16 images per device */
    bench->num_images = settings->num_images < 0
        ? ocl->num_devices * 16
        : settings->num_images;
    bench->image_size = settings->width * settings->height * sizeof(gfloat);

    /* Per-image bookkeeping arrays */
    bench->single_result = g_malloc0(bench->num_images * sizeof(gfloat *));
    bench->multi_result = g_malloc0(bench->num_images * sizeof(gfloat *));
    bench->host_data = g_malloc0(bench->num_images * sizeof(gfloat *));
    bench->events = g_malloc0(bench->num_images * sizeof(cl_event));
    bench->read_events = g_malloc0(bench->num_images * sizeof(cl_event));
    bench->write_events = g_malloc0(bench->num_images * sizeof(cl_event));

    g_print("# Computing <nlm> for %i images of size %ix%i\n", bench->num_images, settings->width, settings->height);

    for (guint img = 0; img < bench->num_images; img++) {
        gfloat *pixels = g_malloc0(bench->image_size);

        bench->host_data[img] = pixels;
        bench->single_result[img] = g_malloc0(bench->image_size);
        bench->multi_result[img] = g_malloc0(bench->image_size);

        /* Random input data for this image */
        for (guint px = 0; px < settings->width * settings->height; px++)
            pixels[px] = (gfloat) g_random_double();
    }

    /* One input and one output device buffer per device */
    bench->dev_data_in = g_malloc0(ocl->num_devices * sizeof(cl_mem));
    bench->dev_data_out = g_malloc0(ocl->num_devices * sizeof(cl_mem));

    for (guint dev = 0; dev < ocl->num_devices; dev++) {
        bench->dev_data_in[dev] = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE, bench->image_size, NULL, &errcode);
        CHECK_ERROR(errcode);
        bench->dev_data_out[dev] = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE, bench->image_size, NULL, &errcode);
        CHECK_ERROR(errcode);
    }

    return bench;
}
Пример #2
0
/*
 * Small reproduction program: runs the "fill_ones" kernel from test.cl once
 * on the first command queue, releases every resource except the kernel
 * event, and then waits for Enter so GPU memory usage can be inspected
 * externally while the process is still alive.
 *
 * Returns 1 if no OpenCL platform could be set up, 0 otherwise.
 */
int
main (void)
{
    OclPlatform *ocl;
    cl_mem mem;
    cl_program program;
    cl_kernel kernel;
    cl_int errcode;
    cl_event event;
    size_t n_elements;

    cl_command_queue *cmd_queues;

    ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_ALL, 0);

    if (ocl == NULL)
        return 1;

    program = ocl_create_program_from_file (ocl, "test.cl", NULL, &errcode);
    OCL_CHECK_ERROR (errcode);

    cmd_queues = ocl_get_cmd_queues (ocl);
    kernel = clCreateKernel (program, "fill_ones", &errcode);
    OCL_CHECK_ERROR (errcode);

    n_elements = 1024 * 1024;
    mem = clCreateBuffer (ocl_get_context (ocl), CL_MEM_READ_WRITE,
                          n_elements * sizeof (float),
                          NULL, &errcode);
    /* Report a failed allocation here, before the buffer is handed to the kernel. */
    OCL_CHECK_ERROR (errcode);

    OCL_CHECK_ERROR (clSetKernelArg (kernel, 0, sizeof (cl_mem), &mem));
    OCL_CHECK_ERROR (clEnqueueNDRangeKernel (cmd_queues[0], kernel,
                                             1, NULL, &n_elements, NULL,
                                             0, NULL, &event));

    OCL_CHECK_ERROR (clWaitForEvents (1, &event));

    /*
     * If the event is not released, nvidia-smi will report that about 60 MB are
     * not freed, although we free all other resources including the memory
     * object itself. The release is left disabled on purpose so that the
     * effect can be observed; enable the next line to see the memory go away.
     */
    /* OCL_CHECK_ERROR (clReleaseEvent (event)); */

    OCL_CHECK_ERROR (clReleaseMemObject (mem));
    OCL_CHECK_ERROR (clReleaseKernel (kernel));
    OCL_CHECK_ERROR (clReleaseProgram (program));

    ocl_free (ocl);

    /*
     * No fflush (stdin) here: flushing an input stream is undefined behaviour
     * in ISO C. Nothing has been read from stdin at this point anyway.
     */
    printf ("Press Enter to exit ...\n");
    getchar ();

    return 0;
}
Пример #3
0
int
main (int argc, const char **argv)
{
    OclPlatform *ocl;
    cl_int errcode;
    cl_program program;
    const char *inname;
    size_t asize;
    size_t *sizes;
    char **binaries;

    if (argc < 2) {
        printf ("Usage: dump-opencl-binary input.cl\n");
        exit (0);
    }

    inname = argv[1];

    ocl = ocl_new (0, CL_DEVICE_TYPE_GPU);
    asize = sizeof (size_t) * ocl_get_num_devices (ocl);
    sizes = malloc (asize);
    binaries = malloc (sizeof (char *) * ocl_get_num_devices (ocl));

    program = ocl_create_program_from_file (ocl, inname, NULL, &errcode);
    OCL_CHECK_ERROR (errcode);

    OCL_CHECK_ERROR (clGetProgramInfo (program, CL_PROGRAM_BINARY_SIZES, asize, sizes, NULL));
    
    for (int i = 0; i < ocl_get_num_devices (ocl); i++)
        binaries[i] = malloc (sizes[i]);

    OCL_CHECK_ERROR (clGetProgramInfo (program, CL_PROGRAM_BINARIES, 0, binaries, NULL));

    for (int i = 0; i < ocl_get_num_devices (ocl); i++) {
        char fname[256];
        FILE *fp;

        snprintf (fname, 256, "%s.%i", inname, i);
        fp = fopen (fname, "wb");
        fwrite (binaries[0], sizes[i], 1, fp);
        fclose (fp);
    }

    OCL_CHECK_ERROR (clReleaseProgram (program));

    free (sizes);
    ocl_free (ocl);
}
Пример #4
0
/*
 * Entry point of the multi-GPU benchmark.
 *
 * Parses the command line into `settings`, sets up the benchmark and runs it
 * in three configurations (single GPU, multi GPU single-threaded, multi GPU
 * multi-threaded).
 *
 * Returns 0 on success and 1 if option parsing, OpenCL initialization or the
 * benchmark setup failed.
 */
int main(int argc, char *argv[])
{
    static Settings settings = {
        .num_images = -1,
        .width = 1024,
        .height = 1024,
        .do_profile = FALSE
    };

    static GOptionEntry entries[] = {
        { "num-images", 'n', 0, G_OPTION_ARG_INT, &settings.num_images, "Number of images", "N" },
        { "width", 'w', 0, G_OPTION_ARG_INT, &settings.width, "Width of imags", "W" },
        { "height", 'h', 0, G_OPTION_ARG_INT, &settings.height, "Height of images", "H" },
        /* 'p' rather than 'n': 'n' already belongs to --num-images above, so
         * the short form of this option could never be selected. */
        { "enable-profiling", 'p', 0, G_OPTION_ARG_NONE, &settings.do_profile, "Enable profiling", NULL },
        { NULL }
    };

    GOptionContext *context;
    opencl_desc *ocl;
    Benchmark *benchmark;
    GError *error = NULL;

    context = g_option_context_new (" - test multi GPU performance");
    g_option_context_add_main_entries (context, entries, NULL);

    if (!g_option_context_parse (context, &argc, &argv, &error)) {
        g_print ("Option parsing failed: %s\n", error->message);
        g_error_free (error);
        g_option_context_free (context);
        return 1;
    }

    /* The parsed values live in `settings`; the context is no longer needed. */
    g_option_context_free (context);

    g_print("## %s@%s\n", g_get_user_name(), g_get_host_name());

    g_thread_init (NULL);

    ocl = ocl_new (settings.do_profile);

    if (ocl == NULL) {
        g_print ("Could not initialize OpenCL\n");
        return 1;
    }

    benchmark = setup_benchmark (ocl, &settings);

    /*
     * setup_benchmark() returns NULL on failure and has already released
     * `ocl` in that case, so it must not be freed a second time here.
     */
    if (benchmark == NULL)
        return 1;

    measure_benchmark ("Single GPU", execute_single_gpu, benchmark);
    measure_benchmark ("Single Threaded, Multi GPU", execute_multi_gpu_single_thread, benchmark);
    measure_benchmark ("Multi Threaded, Multi GPU", execute_multi_gpu_multi_thread, benchmark);

    teardown_benchmark(benchmark);

    ocl_free(ocl);
    return 0;
}
Пример #5
0
/*
 * Runs the same workload (4096 * 2048 elements) through four different
 * command-queue setups and lets run_benchmark() report each with the given
 * format string.
 *
 * Returns 1 if no OpenCL platform could be created, 0 otherwise.
 */
int
main (void)
{
    OclPlatform *platform = ocl_new (0, CL_DEVICE_TYPE_ALL);
    Data *workload;

    if (!platform)
        return 1;

    workload = setup_data (platform, 4096 * 2048);

    /* One run per queue configuration */
    run_benchmark (setup_single_blocking_queue, "Single blocking queue: %fs\n", workload);
    run_benchmark (setup_ooo_queue, "Single out-of-order queue: %fs\n", workload);
    run_benchmark (setup_two_queues, "Two queues: %fs\n", workload);
    run_benchmark (setup_three_queues, "Three queues: %fs\n", workload);

    free_data (workload);
    ocl_free (platform);

    return 0;
}
Пример #6
0
/*
 * Releases the OpenCL program stored in the widget's `data2` slot by passing
 * it to ocl_free(). The slot itself is left untouched.
 */
void widget_window_ocl_free(widget *w)
{
	OCLPROGRAM *program = w->data2;

	ocl_free(program);
}
Пример #7
0
int
main (int argc, const char **argv)
{
    OclPlatform *ocl;
    cl_program program;
    cl_device_id *devices;
    cl_command_queue *queues;
    cl_kernel kernel;
    cl_int errcode;
    int num_devices;
    GTimer *timer;

    ocl = ocl_new_from_args (argc, argv, CL_QUEUE_PROFILING_ENABLE);

    program = ocl_create_program_from_source (ocl, source, NULL, &errcode);
    OCL_CHECK_ERROR (errcode);

    kernel = clCreateKernel (program, "touch", &errcode);
    OCL_CHECK_ERROR (errcode);

    num_devices = ocl_get_num_devices (ocl);
    devices = ocl_get_devices (ocl);
    queues = ocl_get_cmd_queues (ocl);
    timer = g_timer_new ();

    for (int i = 0; i < num_devices; i++) {
        char name[256];
        cl_event event;
        size_t size = 16;
        const int NUM_RUNS = 50000;
        unsigned long total_wait = 0;
        unsigned long total_execution = 0;
        double wall_clock = 0.0;

        for (int r = 0; r < NUM_RUNS; r++) {
            unsigned long wait;
            unsigned long execution;

            g_timer_start (timer);
            OCL_CHECK_ERROR (clEnqueueNDRangeKernel (queues[i], kernel, 
                        1, NULL, &size, NULL,
                        0, NULL, &event));

            clWaitForEvents (1, &event);
            g_timer_stop (timer);

            wall_clock += g_timer_elapsed (timer, NULL);

            get_event_times (event, &wait, &execution);
            clReleaseEvent (event);

            total_wait += wait;
            total_execution += execution;
        }

        OCL_CHECK_ERROR (clGetDeviceInfo (devices[i], CL_DEVICE_NAME, 256, name, NULL));

        /* all times in nano seconds */
        printf ("%s %f %f %f\n", name,
                total_wait / ((double) NUM_RUNS),
                total_execution / ((double) NUM_RUNS),
                wall_clock / NUM_RUNS * 1000 * 1000 * 1000);
    }

    g_timer_destroy (timer);
    clReleaseKernel (kernel);
    clReleaseProgram (program);

    ocl_free (ocl);
}
Пример #8
0
int main(int argc, char* argv[])
{



		const size_t SIZE_execution_bit = (input_length - 3*filter_length +1);
		const size_t SIZE_input_bit = sizeof(gint32)*(input_length+1);
		const size_t SIZE_settings_bit = sizeof(gint32)*4;

		size_t output_bit_on_counts;
		size_t* SIZE_execution_pointer = &SIZE_execution_bit;

		gint32* filtersettings = (gint32*) malloc(SIZE_settings_bit);
		gint32* input_vector = (gint32*) malloc(SIZE_input_bit);
		gint32* positions = (gint32*) malloc(SIZE_input_bit);

		filtersettings[0] = filter_length;
		filtersettings[1] = threshhold;
		filtersettings[2] = input_length;
		filtersettings[3] = 0;



		//GPU-Init
		ocl = ocl_new(CL_DEVICE_TYPE_GPU,1);
		context = ocl_get_context(ocl);
		queue = ocl_get_cmd_queues (ocl)[0];
		clFinish(queue);

		program = ocl_create_program_from_file(ocl, "edel_kernel_secondder.cl", NULL, &errcode);
		OCL_CHECK_ERROR(errcode);

		filter1 = clCreateKernel(program, "second_filter", &errcode);
		OCL_CHECK_ERROR(errcode);

		//GPU-Buffer which can be done before the Computation
		settings = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, SIZE_settings_bit, filtersettings, &errcode);
		OCL_CHECK_ERROR(errcode);

		input = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE_input_bit, NULL, &errcode);
		OCL_CHECK_ERROR(errcode);


		if(debugmode != 0)
		{
			srand((unsigned) time( NULL ));
			counter = rand_rects(expected,1,input_length,3*filter_length,3*filter_length,3*filter_length,peak_length,base+peak, input_vector, noise, base, 0,positions);
			if(harddebug != 0)
			{
				for(i = 0; i < input_length;i++)
				{
					if(input_length < 10000)
					{
						printf("input_vector[%i] = %d\n",i,input_vector[i]);
					}
					else
					{
						printf("input_vector[%i] = %d\t",i,input_vector[i]);
					}
				}
			}

			printf("\n counts = %d\n", counter);
			printf("%lu Bits needed for Output-Vector \n", output_bit_on_counts);

		}

		output_bit_on_counts = sizeof(gint32) * safetyfactor * 2*((counter + 2));

		clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, SIZE_input_bit, input_vector, 0, NULL, NULL);

		gint32* energy_time = (gint32*)malloc(output_bit_on_counts);


		for(i = 0; i < safetyfactor * (2*counter+2); i++)
		{
			energy_time[i] = -9999;
		}


		output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, output_bit_on_counts, NULL , &errcode);
		OCL_CHECK_ERROR(errcode);


		OCL_CHECK_ERROR(clSetKernelArg(filter1, 0, sizeof(cl_mem), &input));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 1, sizeof(cl_mem), &output));
		OCL_CHECK_ERROR(clSetKernelArg(filter1, 2, sizeof(cl_mem), &settings));

		size_t local_item_size;
		size_t global_item_size = (size_t) (input_length - 3*filter_length +1);

		local_item_size = ocl_get_local_size(global_item_size, 2,1);

		             
                if(debugmode != 0)
                {
                        printf("local item size = %lu \n %lu", &local_item_size, local_item_size);
                        if(local_item_size != 0)
                        {
                              printf("This works because you divide %lu / %lu \n and this is %lu", global_item_size,local_item_size, global_item_size/local_item_size);
                        }
                        else
                        {
                              	FILE* attention;
				attention = fopen("filterlengthbad", "a+");
				if(attention == NULL)
				{
					printf("error in opening debug file \n");
					exit(1);
				}
				fprintf(attention, "The filterlength %d is not good for this filter, choose another filterlength ! \n", filter_length);
				fclose(attention);
				printf("There is no way to fit it evenly divided to workgroups, just let OpenCL do it \n");
                        }
                        if(harddebug != 0)
                        {
                                getchar();
                        }

                }


		if(local_item_size == 0)
		{
			OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, NULL, 0, NULL, NULL));	
		}
		else
		{
			OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL));
		}


		
		
		//local_item_size = NULL;	
		clEnqueueReadBuffer(queue, output, CL_TRUE, 0, output_bit_on_counts, energy_time, 0, NULL, NULL);
		clEnqueueReadBuffer(queue, settings, CL_TRUE, 0, SIZE_settings_bit, filtersettings, 0, NULL, NULL);


		//Writing back the data
		for(i = 0; i < filtersettings[3]; i++)
		{
			writing_back(filemode, filename, filename_e,filename_t, energy_time,i);
		}

		if(debugmode != 0)
		{
			printf("The Positions are:\n");
			for(i=0; i < counter; i++)
			{
				printf("%d\t", positions[i]);
				printf("note that this postion is the middle of the rect \n");
			}
		}
		//Safetychanges
		if(filtersettings[3] > counter)
		{
			safetyfactor = safetyfactor + 5*(filtersettings[3] - counter);
			if(safetyfactor <= 0)
			{
				safetyfactor = 10;
			}

			notexpect = filtersettings[3] - expected;
			if(safemode != 0 && notexpect >= notexpect_max)
			{
				printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected.\n", expected, notexpect);
				printf("Safemode is on. Exit program \n");
				OCL_CHECK_ERROR(clReleaseMemObject(input));
				OCL_CHECK_ERROR(clReleaseMemObject(output));
				OCL_CHECK_ERROR(clReleaseMemObject(settings));
				OCL_CHECK_ERROR(clReleaseKernel(filter1));
				OCL_CHECK_ERROR(clReleaseProgram(program));

				ocl_free(ocl);

				free(input_vector);
				free(energy_time);
				free(positions);
				free(filtersettings);

			}
			else
			{
				printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected \n", expected, notexpect);
			}
		}






		OCL_CHECK_ERROR(clReleaseMemObject(input));
		OCL_CHECK_ERROR(clReleaseMemObject(output));
		OCL_CHECK_ERROR(clReleaseMemObject(settings));
		OCL_CHECK_ERROR(clReleaseKernel(filter1));
		OCL_CHECK_ERROR(clReleaseProgram(program));

		ocl_free(ocl);
		free(input_vector);
		free(energy_time);
		free(positions);
		free(filtersettings);




}