void opencl_codelet(void *descr[], void *_args)
{
	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;
	int id, devid, err;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=4;
		size_t local, s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);
                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global) local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

		starpu_opencl_release_kernel(kernel);
	}
}
void opencl_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;
	int id, devid, err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incC", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=100;
		size_t local=100;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = (*async_channel).event.cuda_event;
		cudaError_t cures = cudaEventQuery(event);

		success = (cures == cudaSuccess);
		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = (*async_channel).event.opencl_event;
		if (opencl_event == NULL) STARPU_ABORT();
		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	while (!async_channel->event.finished)
		STARPU_PTHREAD_COND_WAIT(&async_channel->event.cond, &async_channel->event.mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
	cudaError_t cures;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = (*async_channel).event.cuda_event;

		cures = cudaEventSynchronize(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		cures = cudaEventDestroy(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int err;
		if ((*async_channel).event.opencl_event == NULL)
			STARPU_ABORT();
		err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		err = clReleaseEvent((*async_channel).event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
	      break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_MAIN_RAM:
		starpu_disk_wait_request(async_channel);
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}
#endif /* !SIMGRID */
}
int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
{
#if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
	struct starpu_task *task = starpu_task_get_current();
	struct starpu_profiling_task_info *info = task->profiling_info;
#endif

#ifdef CL_PROFILING_CLOCK_CYCLE_COUNT
	if (starpu_profiling_status_get() && info)
	{
		cl_int err;
		unsigned int clock_cycle_count;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_CLOCK_CYCLE_COUNT, sizeof(clock_cycle_count), &clock_cycle_count, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(clock_cycle_count));
		info->used_cycles += clock_cycle_count;
	}
#endif
#ifdef CL_PROFILING_STALL_CYCLE_COUNT
	if (starpu_profiling_status_get() && info)
	{
		cl_int err;
		unsigned int stall_cycle_count;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_STALL_CYCLE_COUNT, sizeof(stall_cycle_count), &stall_cycle_count, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(stall_cycle_count));

		info->stall_cycles += stall_cycle_count;
	}
#endif
#ifdef CL_PROFILING_POWER_CONSUMED
	if (info && (starpu_profiling_status_get() || (task->cl && task->cl->power_model && task->cl->power_model->benchmarking)))
	{
		cl_int err;
		double power_consumed;
		size_t size;
		err = clGetEventProfilingInfo(event, CL_PROFILING_POWER_CONSUMED, sizeof(power_consumed), &power_consumed, &size);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(power_consumed));

		info->power_consumed += power_consumed;
	}
#endif

	return 0;
}
void
opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
{
#if 0
	size_t dim[] = {nx, ny, nz};
#else
	size_t dim[] = {nx, ny, 1};
#endif

        int devid,id;
        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        cl_kernel kernel;
        cl_command_queue cq;
        starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);

        clSetKernelArg(kernel, 0, sizeof(bz), &bz);
        clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
        clSetKernelArg(kernel, 2, sizeof(nx), &nx);
        clSetKernelArg(kernel, 3, sizeof(ny), &ny);
        clSetKernelArg(kernel, 4, sizeof(nz), &nz);
        clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
        clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
        clSetKernelArg(kernel, 7, sizeof(i), &i);

        cl_event ev;
        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS)
                STARPU_OPENCL_REPORT_ERROR(err);
}
void axpy_opencl(void *buffers[], void *_args)
{
	TYPE *alpha = _args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);
	err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
Exemple #8
0
int starpu_opencl_release_kernel(cl_kernel kernel) {
    cl_int err;

    err = clReleaseKernel(kernel);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

    return CL_SUCCESS;
}
void redux_opencl_func(void *buffers[], void *args)
{
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
int _starpu_opencl_init_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_INIT(&task_mutex[devid][j], NULL);
		STARPU_PTHREAD_COND_INIT(&task_cond[devid][j], NULL);
	}
#else /* !STARPU_SIMGRID */
	cl_int err;
	cl_uint uint;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

        _STARPU_DEBUG("Initialising context for dev %d\n", devid);

        // Create a compute context
	err = 0;
        contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	starpu_malloc_set_align(uint/8);

        // Create execution queue for the given device
        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        // Create transfer queue for the given device
        cl_command_queue_properties props;
        err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
        props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif /* !STARPU_SIMGRID */
	return 0;
}
void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;
	int id, devid, err;
	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
        int ldy = (int)STARPU_BLOCK_GET_LDY(descr[0]);
        int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]);
        float *multiplier = (float *)_args;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block);
	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx);
	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny);
	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy);
	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier);

	{
                size_t global=nx*ny*nz;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

        starpu_opencl_release_kernel(kernel);
}
Exemple #12
0
void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err, n;
	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
        unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
        unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
        float *multiplier = (float *)_args;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = 0;
        n=0;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
	err = clSetKernelArg(kernel, 6, sizeof(float), multiplier);
        if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
                size_t global=nx*ny*nz;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);

        starpu_opencl_release_kernel(kernel);
}
void vector_scal_opencl(void *buffers[], void *_args)
{
	float *factor = _args;
	int id, devid, err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	/* OpenCL copy of the vector pointer */
	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
					"vector_mult_opencl", devid);   /* Name of the codelet defined above */
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local=1;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
	char name[30];

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
#elif defined(STARPU_USE_OPENCL)
	/* Request the size of the current device's memory */
	cl_int err;
	cl_ulong size;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	totalGlobalMem = size;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
	if (limit == -1)
	{
		sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default.  */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_OPENCL
	/* How much memory to waste ? */
	to_waste = totalGlobalMem - global_mem[devid];
#endif

	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
			(long)(totalGlobalMem - to_waste)/(1024*1024));

}
Exemple #15
0
int starpu_opencl_load_opencl_from_string(char *opencl_program_source, struct starpu_opencl_program *opencl_programs)
{
    unsigned int dev;
    unsigned int nb_devices;

    nb_devices = _starpu_opencl_get_device_count();
    // Iterate over each device
    for(dev = 0; dev < nb_devices; dev ++) {
        cl_device_id device;
        cl_context   context;
        cl_program   program;
        cl_int       err;

        starpu_opencl_get_device(dev, &device);
        starpu_opencl_get_context(dev, &context);
        opencl_programs->programs[dev] = NULL;

        if (context == NULL) continue;

        // Create the compute program from the source buffer
        program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
        if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

        // Build the program executable
        err = clBuildProgram(program, 1, &device, "-Werror -cl-mad-enable", NULL, NULL);
        if (err != CL_SUCCESS) {
            size_t len;
            static char buffer[4096];

            _STARPU_DISP("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

            _STARPU_DISP("<%s>\n", buffer);
            return EXIT_FAILURE;
        }

        // Store program
        opencl_programs->programs[dev] = program;
    }
    return EXIT_SUCCESS;
}
void init_opencl_func(void *buffers[], void *args)
{
        cl_int err;
	cl_command_queue queue;

	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
	starpu_opencl_get_current_queue(&queue);
	DOT_TYPE zero = (DOT_TYPE) 0.0;

	err = clEnqueueWriteBuffer(queue,
			dot,
			CL_TRUE,
			0,
			sizeof(DOT_TYPE),
			&zero,
			0,
			NULL,
			NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

}
int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	if (!starpu_opencl_worker_get_count())
		return 0;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		if (opencl_programs->programs[dev])
		{
			cl_int err;
			err = clReleaseProgram(opencl_programs->programs[dev]);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);
		}
	}
	return 0;
}
static
int _starpu_opencl_get_binary_name(char *binary_file_name, size_t maxlen, const char *source_file_name, int dev, cl_device_id device)
{
	char binary_directory[1024];
	char *p;
	cl_int err;
	cl_uint vendor_id;

	_starpu_opencl_create_binary_directory(binary_directory, 1024);

	p = strrchr(source_file_name, '/');
	snprintf(binary_file_name, maxlen, "%s/%s", binary_directory, p?p:source_file_name);

	p = strstr(binary_file_name, ".cl");
	if (p == NULL) p=binary_file_name + strlen(binary_file_name);

	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	sprintf(p, ".%s.vendor_id_%d_device_id_%d", _starpu_opencl_get_device_type_as_string(dev), (int)vendor_id, dev);

	return CL_SUCCESS;
}
Exemple #19
0
int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs,
                              char *kernel_name, int devid)
{
    int err;
    cl_device_id device;
    cl_context context;
    cl_program program;

    starpu_opencl_get_device(devid, &device);
    starpu_opencl_get_context(devid, &context);
    starpu_opencl_get_queue(devid, queue);

    program = opencl_programs->programs[devid];
    if (!program) {
        _STARPU_DISP("Program not available\n");
        return CL_INVALID_PROGRAM;
    }

    // Create the compute kernel in the program we wish to run
    *kernel = clCreateKernel(program, kernel_name, &err);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

    return CL_SUCCESS;
}
int _starpu_opencl_deinit_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[devid][j]);
		STARPU_PTHREAD_COND_DESTROY(&task_cond[devid][j]);
	}
#else /* !STARPU_SIMGRID */
        cl_int err;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

        _STARPU_DEBUG("De-initialising context for dev %d\n", devid);

        err = clReleaseContext(contexts[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(in_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        err = clReleaseCommandQueue(out_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        err = clReleaseCommandQueue(peer_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(alloc_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        contexts[devid] = NULL;

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif

        return 0;
}
static
int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program_source, const char* build_options,
						      struct starpu_opencl_program *opencl_programs, const char* source_file_name)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		cl_device_id device;
		cl_context   context;
		cl_program   program;
		cl_int       err;

		if (opencl_programs)
			opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Create the compute program from the source buffer
		program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program source with options %s!\n", build_options);
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		if (opencl_programs)
			opencl_programs->programs[dev] = program;
		else
		{
			char binary_file_name[1024];
			char *binary;
			size_t binary_len;
			FILE *fh;

			err = _starpu_opencl_get_binary_name(binary_file_name, 1024, source_file_name, dev, device);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			binary = malloc(binary_len);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

			fh = fopen(binary_file_name, "w");
			if (fh == NULL)
			{
				_STARPU_DISP("Error: Failed to open file <%s>\n", binary_file_name);
				perror("fopen");
				return EXIT_FAILURE;
			}
			fwrite(binary, binary_len, 1, fh);
			fclose(fh);
			free(binary);
			_STARPU_DEBUG("File <%s> created\n", binary_file_name);
		}
	}
	return EXIT_SUCCESS;
}
int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		cl_device_id device;
		cl_context   context;
		cl_program   program;
		cl_int       err;
		char        *binary;
		char         binary_file_name[1024];
		size_t       length;
		cl_int       binary_status;

		opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Load the binary buffer
		err = _starpu_opencl_get_binary_name(binary_file_name, 1024, kernel_id, dev, device);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
		binary = _starpu_opencl_load_program_binary(binary_file_name, &length);

		// Create the compute program from the binary buffer
		program = clCreateProgramWithBinary(context, 1, &device, &length, (const unsigned char **) &binary, &binary_status, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program binary!\n");
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		opencl_programs->programs[dev] = program;
	}
	return 0;
}
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;

        cl_int             err;
	cl_kernel          kernel;
	cl_command_queue   queue;
	cl_event           event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
		sizeof(int), &variable_config.copy_failed, &err);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"variable_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	err = clEnqueueReadBuffer(queue,
				  fail,
				  CL_TRUE,
				  0, 
				  sizeof(int),
				  &variable_config.copy_failed,
				  0,
				  NULL,
				  NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
void multiformat_scal_opencl_func(void *buffers[], void *args)
{
	(void) args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"multiformat_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(n), &n);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	(void) args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = CUSTOM_GET_NX(buffers[0]);
	n*=2;
	struct point *aop;
	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&_opencl_conversion_program,
					"custom_opencl_conversion",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
	if (starpu_opencl_set_kernel_args(&err, &kernel,
					  sizeof(aop), &aop,
					  sizeof(x), &x,
					  sizeof(n), &n,
					  0) != 3)
	{
		STARPU_OPENCL_REPORT_ERROR(err);
		assert(0);
	}
	

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(
				queue,
				kernel,
				1,       /* work_dim */
				NULL,    /* global_work_offset */
				&global, /* global_work_size */
				&local,  /* local_work_size */
				0,       /* num_events_in_wait_list */
				NULL,    /* event_wait_list */
				NULL);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	(void) args;
	int id, devid, ret;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
						  &opencl_conversion_program,
						  NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_conversion_program,
					"cpu_to_opencl_opencl",
					devid);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_conversion_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
}