void dw_cublas_codelet_update_u11(void *descr[], void *_args)
{
	dw_common_codelet_update_u11(descr, 1, _args);

	int id = starpu_worker_get_id();
	count_11_per_worker[id]++;
}
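count_11_per_worker (like the flop_per_worker, ls_per_worker and count_21_per_worker arrays used by the neighbouring codelets) is assumed to be a global counter array indexed by the worker id; a minimal sketch of such a declaration, mirroring the top_per_worker/bottom_per_worker arrays that appear later in this listing:

/* one counter slot per StarPU worker (assumed declaration) */
static unsigned count_11_per_worker[STARPU_NMAXWORKERS];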
void
opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, int i)
{
#if 0
	size_t dim[] = {nx, ny, nz};
#else
	size_t dim[] = {nx, ny, 1};
#endif

        int devid, id;
        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        cl_kernel kernel;
        cl_command_queue cq;
        starpu_opencl_load_kernel(&kernel, &cq, &program, "shadow", devid);

        clSetKernelArg(kernel, 0, sizeof(bz), &bz);
        clSetKernelArg(kernel, 1, sizeof(ptr), &ptr);
        clSetKernelArg(kernel, 2, sizeof(nx), &nx);
        clSetKernelArg(kernel, 3, sizeof(ny), &ny);
        clSetKernelArg(kernel, 4, sizeof(nz), &nz);
        clSetKernelArg(kernel, 5, sizeof(ldy), &ldy);
        clSetKernelArg(kernel, 6, sizeof(ldz), &ldz);
        clSetKernelArg(kernel, 7, sizeof(i), &i);

        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
        if (err != CL_SUCCESS)
                STARPU_OPENCL_REPORT_ERROR(err);
}
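The launcher above assumes an OpenCL kernel named "shadow" whose parameter list matches the eight clSetKernelArg calls. A signature-only sketch (the body is an assumption; the actual boundary replication depends on the stencil):

__kernel void shadow(int bz, __global TYPE *ptr, int nx, int ny, int nz,
                     int ldy, int ldz, int i)
{
	/* one work-item per (x, y) point of the block */
	size_t x = get_global_id(0);
	size_t y = get_global_id(1);
	if (x >= (size_t)nx || y >= (size_t)ny)
		return;
	/* ... copy the shadow/ghost cells for this column here ... */
}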
Example #3
void callback_func(void *arg)
{
	/* do some accounting */
	int id = starpu_worker_get_id();
	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
}
void dw_cpu_codelet_update_u21(void *descr[], void *_args)
{
	dw_common_codelet_update_u21(descr, 0, _args);

	int id = starpu_worker_get_id();
	count_21_per_worker[id]++;
}
void opencl_codelet(void *descr[], void *_args)
{
	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "incrementer", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=4;
		size_t local, s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);
                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global) local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

		starpu_opencl_release_kernel(kernel);
	}
}
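The host side above enqueues a global size of 4 on a single vector buffer. A plausible "incrementer" kernel matching that launch, assuming a float vector where each work-item bumps one element:

__kernel void incrementer(__global float *val)
{
	const size_t i = get_global_id(0);
	val[i] = val[i] + 1.0f;
}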
void opencl_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incC", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=100;
		size_t local=100;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
static
int ws_push_task(struct starpu_task *task)
{
	unsigned sched_ctx_id = task->sched_ctx;
	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

	struct _starpu_deque_jobq *deque_queue;
	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
	int workerid = starpu_worker_get_id();

	unsigned worker = 0;
	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
	struct starpu_sched_ctx_iterator it;
	
	workers->init_iterator(workers, &it);
	/* !! It's silly to lock them all! */
	while(workers->has_next(workers, &it))
	{
		worker = workers->get_next(workers, &it);
		starpu_pthread_mutex_t *sched_mutex;
		starpu_pthread_cond_t *sched_cond;
		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
		STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
	}
	
	
	/* If the current thread is not a worker but
	 * the main thread (-1), we find the better one to
	 * put task on its queue */
	if (workerid == -1)
		workerid = select_worker(sched_ctx_id);

	deque_queue = ws->queue_array[workerid];

#ifdef HAVE_AYUDAME_H
	if (AYU_event)
	{
		intptr_t id = workerid;
		AYU_event(AYU_ADDTASKTOQUEUE, j->job_id, &id);
	}
#endif
	_starpu_job_list_push_back(&deque_queue->jobq, j);
	deque_queue->njobs++;
	starpu_push_task_end(task);

	/* the iterator was exhausted by the locking loop above, reset it */
	workers->init_iterator(workers, &it);
	while(workers->has_next(workers, &it))
	{
		worker = workers->get_next(workers, &it);
		starpu_pthread_mutex_t *sched_mutex;
		starpu_pthread_cond_t *sched_cond;
		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
#ifndef STARPU_NON_BLOCKING_DRIVERS
		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
#endif
		STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
	}
		
	return 0;
}
void master_g(void *arg)
{
	(void) arg;
	int worker_id;
	pthread_t tid;
	tid = pthread_self();
	worker_id = starpu_worker_get_id();
	printf("[tid %p] task thread = %d -- master\n", (void *)tid, worker_id);
}
void task_region_g(void *buffers[], void *args)
{
	(void) buffers;
	int i = (int)(intptr_t) args;
	int worker_id;
	pthread_t tid;
	tid = pthread_self();
	worker_id = starpu_worker_get_id();
	printf("[tid %p] task thread = %d: explicit task \"g[%d]\"\n", (void *)tid, worker_id, i);
}
static void soclCreateKernel_task(void *data) {
   struct _cl_kernel *k = (struct _cl_kernel *)data;

   int range = starpu_worker_get_range();
   cl_int err;

   if (k->program->cl_programs[range] == NULL) {
      k->errcodes[range] = CL_SUCCESS;
      DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id());
      return;
   }

   DEBUG_MSG("[Device %d] Creating kernel...\n", starpu_worker_get_id());
   k->cl_kernels[range] = clCreateKernel(k->program->cl_programs[range], k->kernel_name, &err);
   if (err != CL_SUCCESS) {
      k->errcodes[range] = err;
      ERROR_STOP("[Device %d] Unable to create kernel. Error %d. Aborting.\n", starpu_worker_get_id(), err);
      return;
   }

   /* One worker creates argument structures */
   if (__sync_bool_compare_and_swap(&k->num_args, 0, 666)) {
      unsigned int i;
      cl_uint num_args;

      err = clGetKernelInfo(k->cl_kernels[range], CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
      if (err != CL_SUCCESS) {
         DEBUG_CL("clGetKernelInfo", err);
         ERROR_STOP("Unable to get kernel argument count. Aborting.\n");
      }
      k->num_args = num_args;
      DEBUG_MSG("Kernel has %d arguments\n", num_args);

      k->arg_size = (size_t*)malloc(sizeof(size_t) * num_args);
      k->arg_value = (void**)malloc(sizeof(void*) * num_args);
      k->arg_type = (enum kernel_arg_type*)malloc(sizeof(enum kernel_arg_type) * num_args);
      /* Set default argument values and types */
      for (i=0; i<num_args; i++) {
         k->arg_value[i] = NULL;
         k->arg_type[i] = Null;
      }
   }
}
void parallel_region_f(void *buffers[], void *args)
{
	(void) buffers;
	(void) args;
	int worker_id;
	pthread_t tid;
	tid = pthread_self();
	worker_id = starpu_worker_get_id();
	printf("[tid %p] task thread = %d -- parallel -->\n", (void *)tid, worker_id);
	starpu_omp_master(master_g, NULL);
	starpu_omp_master(master_g, NULL);
	starpu_omp_master(master_g, NULL);
	starpu_omp_master(master_g, NULL);
	printf("[tid %p] task thread = %d -- parallel <--\n", (void *)tid, worker_id);
}
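A region function like parallel_region_f is typically started through StarPU's OpenMP runtime support. A minimal sketch, assuming the starpu_omp_parallel_region() entry point and an attribute layout analogous to the task-region attributes used further down in this listing (run_parallel_region_example is an illustrative name, not from the source):

void run_parallel_region_example(void)
{
	struct starpu_omp_parallel_region_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.cl.cpu_funcs[0] = parallel_region_f;
	attr.cl.where        = STARPU_CPU;
	attr.if_clause       = 1;
	starpu_omp_parallel_region(&attr);
}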
Example #12
void axpy_opencl(void *buffers[], void *_args)
{
	TYPE *alpha = _args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;

	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);
	err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
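The matching device code must export a kernel named "_axpy_opencl" taking (x, y, n, alpha) in that order. A minimal sketch of the classic AXPY kernel (y <- alpha*x + y), with TYPE as in the host code:

__kernel void _axpy_opencl(__global TYPE *x, __global TYPE *y,
                           unsigned n, TYPE alpha)
{
	const size_t i = get_global_id(0);
	if (i < n)
		y[i] = alpha * x[i] + y[i];
}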
Example #13
static void common_data_cpy_func(void *descr[], void *cl_arg)
{
	unsigned interface_id = *(unsigned *)cl_arg;

	const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id);
	const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods;

	int workerid = starpu_worker_get_id();
	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
	unsigned memory_node = starpu_worker_get_memory_node(workerid);

	void *dst_interface = descr[0];
	void *src_interface = descr[1];

	switch (type)
	{
		case STARPU_CPU_WORKER:
			if (copy_methods->ram_to_ram)
			{
				copy_methods->ram_to_ram(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		case STARPU_CUDA_WORKER:
			if (copy_methods->cuda_to_cuda)
			{
				copy_methods->cuda_to_cuda(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		case STARPU_OPENCL_WORKER:
			if (copy_methods->opencl_to_opencl)
			{
				copy_methods->opencl_to_opencl(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		default:
			/* unknown architecture */
			STARPU_ABORT();
	}
	STARPU_ASSERT(copy_methods->any_to_any);
	copy_methods->any_to_any(src_interface, memory_node, dst_interface, memory_node, NULL);

}
void redux_opencl_func(void *buffers[], void *args)
{
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;

	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
	err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
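The host code above runs a single work-item over two variables. Assuming the usual dot-product reduction (fold the second partial result into the first) and a DOT_TYPE element type, the "_redux_opencl" kernel could look like:

__kernel void _redux_opencl(__global DOT_TYPE *dota, __global DOT_TYPE *dotb)
{
	*dota = *dota + *dotb;
}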
static struct starpu_task * best_implementation_pull_task(struct starpu_sched_component * component)
{
	struct starpu_task * task = NULL;
	int i;
	for(i=0; i < component->nparents; i++)
	{
		if(component->parents[i] == NULL)
			continue;
		else
		{
			task = component->parents[i]->pull_task(component->parents[i]);
			if(task)
				break;
		}
	}
	if(task)
		/* this worker can execute this task, as it was returned by a pop */
		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id());
	return task;
}
Example #16
void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err;
	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
        unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
        unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
        float *multiplier = (float *)_args;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
	err |= clSetKernelArg(kernel, 1, sizeof(int), &nx);
	err |= clSetKernelArg(kernel, 2, sizeof(int), &ny);
	err |= clSetKernelArg(kernel, 3, sizeof(int), &nz);
	err |= clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
	err |= clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
	err |= clSetKernelArg(kernel, 6, sizeof(float), multiplier);
        if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
                size_t global=nx*ny*nz;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);

        starpu_opencl_release_kernel(kernel);
}
void opencl_codelet(void *descr[], void *_args)
{
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;
	int id, devid, err;
	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
        int ldy = (int)STARPU_BLOCK_GET_LDY(descr[0]);
        int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]);
        float *multiplier = (float *)_args;

        id = starpu_worker_get_id();
        devid = starpu_worker_get_devid(id);

        err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block);
	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx);
	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny);
	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy);
	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz);
	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier);

	{
                size_t global=nx*ny*nz;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

        starpu_opencl_release_kernel(kernel);
}
void parallel_region_f(void *buffers[], void *args)
{
	(void) buffers;
	(void) args;
	int worker_id;
	pthread_t tid;
	struct starpu_omp_task_region_attr attr;
	int i = 0;

	tid = pthread_self();
	worker_id = starpu_worker_get_id();
	printf("[tid %p] task thread = %d: implicit task \"f\"\n", (void *)tid, worker_id);
	
	starpu_omp_taskgroup_inline_begin();
	taskgroup_f((void *)&i);
	starpu_omp_taskgroup_inline_end();
	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);

	starpu_omp_taskgroup_inline_begin();
	taskgroup_f((void *)&i);
	starpu_omp_taskgroup_inline_end();
	printf("[tid %p] task thread = %d: implicit task \"f\": taskgroup\n", (void *)tid, worker_id);

	memset(&attr, 0, sizeof(attr));
	attr.cl.cpu_funcs[0]  = task_region_g;
	attr.cl.where         = STARPU_CPU;
	attr.cl_arg_size      = sizeof(void *);
	attr.cl_arg_free      = 0;
	attr.if_clause        = 1;
	attr.final_clause     = 0;
	attr.untied_clause    = 1;
	attr.mergeable_clause = 0;

	attr.cl_arg = (void *)(intptr_t)i++;
	starpu_omp_task_region(&attr);

	attr.cl_arg = (void *)(intptr_t)i++;
	starpu_omp_task_region(&attr);
}
void vector_scal_opencl(void *buffers[], void *_args)
{
	float *factor = _args;
	int id, devid, err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	/* OpenCL copy of the vector pointer */
	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
					"vector_mult_opencl", devid);   /* Name of the kernel, as defined in the OpenCL source */
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local=1;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
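To run vector_scal_opencl it has to be wired into a codelet and given a data handle plus the factor argument. A minimal usage sketch, assuming the vector handle is registered and `programs` is loaded elsewhere (scal_cl and submit_scal are illustrative names, not from the source):

static struct starpu_codelet scal_cl =
{
	.where = STARPU_OPENCL,
	.opencl_funcs = {vector_scal_opencl},
	.nbuffers = 1,
	.modes = {STARPU_RW},
};

int submit_scal(starpu_data_handle_t vector_handle, float *factor)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &scal_cl;
	task->handles[0] = vector_handle;
	task->cl_arg = factor;            /* delivered to the codelet as _args */
	task->cl_arg_size = sizeof(*factor);
	return starpu_task_submit(task);
}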
void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
	command_ndrange_kernel cmd = (command_ndrange_kernel)args;

   cl_command_queue cq;
   int wid;
   cl_int err;

   cl_event ev = command_event_get(cmd);
   ev->prof_start = _socl_nanotime();
   gc_entity_release(ev);

   wid = starpu_worker_get_id();
   starpu_opencl_get_queue(wid, &cq);

   DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);

   int range = starpu_worker_get_range();

   /* Set arguments */
   {
	   unsigned int i;
	   int buf = 0;
	   for (i=0; i<cmd->num_args; i++) {
		   switch (cmd->arg_types[i]) {
			   case Null:
				   err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
				   break;
			   case Buffer: {
						cl_mem mem;  
						mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
						err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
						buf++;
					}
					break;
			   case Immediate:
					err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
					break;
		   }
		   if (err != CL_SUCCESS) {
			   DEBUG_CL("clSetKernelArg", err);
			   DEBUG_ERROR("Aborting\n");
		   }
	   }
   }

   /* Calling Kernel */
   cl_event event;
   err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);

   if (err != CL_SUCCESS) {
	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
	   DEBUG_CL("clEnqueueNDRangeKernel", err);
	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
	   if (cmd->local_work_size != NULL)
		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
   }
   else {
      /* Waiting for kernel to terminate */
      clWaitForEvents(1, &event);
      clReleaseEvent(event);
   }
}
struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
{
	struct starpu_task *task;
	int worker_id;
	unsigned node;

	/* We can't tell in advance which task will be picked up, so we measure
	 * a timestamp, and will attribute it afterwards to the task. */
	int profiling = starpu_profiling_status_get();
	struct timespec pop_start_time;
	if (profiling)
		_starpu_clock_gettime(&pop_start_time);

pick:
	/* perhaps there is some local task to be executed first */
	task = _starpu_pop_local_task(worker);


	/* get tasks from the stacks of the strategy */
	if(!task)
	{
		struct _starpu_sched_ctx *sched_ctx ;
#ifndef STARPU_NON_BLOCKING_DRIVERS
		int been_here[STARPU_NMAX_SCHED_CTXS];
		int i;
		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
			been_here[i] = 0;

		while(!task)
#endif
		{
			if(worker->nsched_ctxs == 1)
				sched_ctx = _starpu_get_initial_sched_ctx();
			else
			{
				while(1)
				{
					sched_ctx = _get_next_sched_ctx_to_pop_into(worker);

					if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1)
					{
						_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
						worker->removed_from_ctx[sched_ctx->id] = 0;
						sched_ctx = NULL;
					}
					else
						break;
				}
			}

			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			{
				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
				{
					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
					_starpu_pop_task_end(task);
				}
			}
			
			if(!task)
			{
				/* it doesn't matter if it shares the tasks list or not in the scheduler,
				   if it does not have any task to pop just get it out of here */
				/* however if it shares a task list it will be removed as soon as it
				   finishes this job (in handle_job_termination) */
				if(worker->removed_from_ctx[sched_ctx->id])
				{
					_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
					worker->removed_from_ctx[sched_ctx->id] = 0;
				}
#ifdef STARPU_USE_SC_HYPERVISOR
				if(worker->pop_ctx_priority)
				{
					struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
					if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
					{
//					_STARPU_TRACE_HYPERVISOR_BEGIN();
						perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
//					_STARPU_TRACE_HYPERVISOR_END();
					}
				}
#endif //STARPU_USE_SC_HYPERVISOR
				
#ifndef STARPU_NON_BLOCKING_DRIVERS
				if(been_here[sched_ctx->id] || worker->nsched_ctxs == 1)
					break;

				been_here[sched_ctx->id] = 1;

#endif
			}
		}
	}


	if (!task)
	{
		idle_start[worker->workerid] = starpu_timing_now();
		return NULL;
	}

	if(idle_start[worker->workerid] != 0.0)
	{
		double idle_end = starpu_timing_now();
		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
		idle_start[worker->workerid] = 0.0;
	}
	

#ifdef STARPU_USE_SC_HYPERVISOR
	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;

	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
	{
//		_STARPU_TRACE_HYPERVISOR_BEGIN();
		perf_counters->notify_poped_task(task->sched_ctx, worker->workerid);
//		_STARPU_TRACE_HYPERVISOR_END();
	}
#endif //STARPU_USE_SC_HYPERVISOR


	/* Make sure we do not bother with all the multiformat-specific code if
	 * it is not necessary. */
	if (!_starpu_task_uses_multiformat_handles(task))
		goto profiling;


	/* This is either a conversion task, or a regular task for which the
	 * conversion tasks have already been created and submitted */
	if (task->mf_skip)
		goto profiling;

	/*
	 * This worker may not be able to execute this task. In this case, we
	 * should return the task anyway. It will be pushed back almost immediately.
	 * This way, we avoid computing and executing the conversion tasks.
	 * Here, we do not care about what implementation is used.
	 */
	worker_id = starpu_worker_get_id();
	if (!starpu_worker_can_execute_task_first_impl(worker_id, task, NULL))
		return task;

	node = starpu_worker_get_memory_node(worker_id);

	/*
	 * We do have a task that uses multiformat handles. Let's create the
	 * required conversion tasks.
	 */
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
	unsigned i;
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	for (i = 0; i < nbuffers; i++)
	{
		struct starpu_task *conversion_task;
		starpu_data_handle_t handle;

		handle = STARPU_TASK_GET_HANDLE(task, i);
		if (!_starpu_handle_needs_conversion_task(handle, node))
			continue;
		conversion_task = _starpu_create_conversion_task(handle, node);
		conversion_task->mf_skip = 1;
		conversion_task->execute_on_a_specific_worker = 1;
		conversion_task->workerid = worker_id;
		/*
		 * Next tasks will need to know where these handles have gone.
		 */
		handle->mf_node = node;
		_starpu_task_submit_conversion_task(conversion_task, worker_id);
	}

	task->mf_skip = 1;
	starpu_task_list_push_back(&worker->local_tasks, task);
	STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
	goto pick;

profiling:
	if (profiling)
	{
		struct starpu_profiling_task_info *profiling_info;
		profiling_info = task->profiling_info;

		/* The task may have been created before profiling was enabled,
		 * so we check if the profiling_info structure is available
		 * even though we already tested if profiling is enabled. */
		if (profiling_info)
		{
			memcpy(&profiling_info->pop_start_time,
				&pop_start_time, sizeof(struct timespec));
			_starpu_clock_gettime(&profiling_info->pop_end_time);
		}
	}

	if(task->prologue_callback_pop_func)
		task->prologue_callback_pop_func(task->prologue_callback_pop_arg);

	return task;
}
void test_variable_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	int id, devid, ret;
	int factor = *(int *) args;

        cl_int             err;
	cl_kernel          kernel;
	cl_command_queue   queue;
	cl_event           event;

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]);

	cl_context context;
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);
	starpu_opencl_get_context(devid, &context);

	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
		sizeof(int), &variable_config.copy_failed, &err);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"variable_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(fail), &fail);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(factor), &factor);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global = 1;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	err = clEnqueueReadBuffer(queue,
				  fail,
				  CL_TRUE,
				  0, 
				  sizeof(int),
				  &variable_config.copy_failed,
				  0,
				  NULL,
				  NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
	return;
}
void multiformat_scal_opencl_func(void *buffers[], void *args)
{
	(void) args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_program,
					"multiformat_opencl",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(n), &n);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	(void) args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;

	unsigned n = CUSTOM_GET_NX(buffers[0]);
	n*=2;
	struct point *aop;
	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&_opencl_conversion_program,
					"custom_opencl_conversion",
					devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
	if (starpu_opencl_set_kernel_args(&err, &kernel,
					  sizeof(aop), &aop,
					  sizeof(x), &x,
					  sizeof(n), &n,
					  0) != 3)
	{
		STARPU_OPENCL_REPORT_ERROR(err);
		assert(0);
	}
	

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(
				queue,
				kernel,
				1,       /* work_dim */
				NULL,    /* global_work_offset */
				&global, /* global_work_size */
				&local,  /* local_work_size */
				0,       /* num_events_in_wait_list */
				NULL,    /* event_wait_list */
				NULL);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
void cpu_to_opencl_opencl_func(void *buffers[], void *args)
{
	STARPU_SKIP_IF_VALGRIND;

	(void) args;
	int id, devid, ret;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
						  &opencl_conversion_program,
						  NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");

	err = starpu_opencl_load_kernel(&kernel,
					&queue,
					&opencl_conversion_program,
					"cpu_to_opencl_opencl",
					devid);

	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);


	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel,
						device,
						CL_KERNEL_WORK_GROUP_SIZE,
						sizeof(local),
						&local,
						&s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);

                if (local > global)
			local = global;

		err = clEnqueueNDRangeKernel(queue,
					kernel,
					1,
					NULL,
					&global,
					&local,
					0,
					NULL,
					&event);

		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
        ret = starpu_opencl_unload_opencl(&opencl_conversion_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
}
Example #26
/*
 * cl_update (CUDA version)
 */
static void update_func_cuda(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d              !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
    load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

        /* And perform actual computation */
#ifdef LIFE
        cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
#endif /* LIFE */
    }

    cudaError_t cures;
    if ((cures = cudaThreadSynchronize()) != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(cures);

}
#endif /* STARPU_USE_CUDA */

/*
 * cl_update (CPU version)
 */
static void update_func_cpu(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d            !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
    load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        unsigned ldy = oldb->ldy, ldz = oldb->ldz;
        unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
        unsigned x, y, z;
        unsigned stepx = 1;
        unsigned stepy = 1;
        unsigned stepz = 1;
        unsigned idx = 0;
        unsigned idy = 0;
        unsigned idz = 0;
        TYPE *ptr = old;

#		include "shadow.h"

        /* And perform actual computation */
#ifdef LIFE
        life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
#endif /* LIFE */
    }
}

/* Performance model and codelet structure */
static struct starpu_perfmodel_t cl_update_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "cl_update"
};

starpu_codelet cl_update = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = update_func_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = update_func_cuda,
#endif
    .model = &cl_update_model,
    .nbuffers = 6
};

/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version */
static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    memcpy(boundary_data, &block_data[offset], boundary_size);
}

/* CUDA version */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
}
#endif /* STARPU_USE_CUDA */

/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K)- K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], K);
    load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}

/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K)- K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
    cudaThreadSynchronize();
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], K);
    load_subblock_into_buffer_cuda(descr[1], descr[3], K);
    cudaThreadSynchronize();
}
#endif /* STARPU_USE_CUDA */

/* Performance models and codelet for save */
static struct starpu_perfmodel_t save_cl_bottom_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_bottom"
};

static struct starpu_perfmodel_t save_cl_top_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_top"
};

starpu_codelet save_cl_bottom = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_bottom_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_bottom_cuda,
#endif
    .model = &save_cl_bottom_model,
    .nbuffers = 4
};

starpu_codelet save_cl_top = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_top_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_top_cuda,
#endif
    .model = &save_cl_top_model,
    .nbuffers = 4
};
Example #27
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> CUBLAS_RAM */
		/* only the proper CUBLAS thread can initiate this ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
		/* SCC RAM associated to the master process is considered as
		 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);

		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void * ptr = NULL;
			starpu_ssize_t size = 0;
			handle->ops->pack_data(handle, src_node, &ptr, &size);
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, &req->async_channel);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
		
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
		if(copy_methods->any_to_any) 
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void * ptr = NULL;
			size_t size = 0;
			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, &req->async_channel);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size); 
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):	
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
		
	default:
		STARPU_ABORT();
		break;
	}
	
	return ret;
#endif /* !SIMGRID */
}
/* Note: this is not scalable work stealing, use lws instead */
static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
{
	struct _starpu_work_stealing_data *ws = (struct _starpu_work_stealing_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);

	struct starpu_task *task;
	struct _starpu_deque_jobq *q;

	int workerid = starpu_worker_get_id();

	STARPU_ASSERT(workerid != -1);

	q = ws->queue_array[workerid];

	task = _starpu_deque_pop_task(q, workerid);
	if (task)
	{
		/* there was a local task */
		ws->performed_total++;
		q->nprocessed++;
		q->njobs--;
		return task;
	}
	starpu_pthread_mutex_t *worker_sched_mutex;
	starpu_pthread_cond_t *worker_sched_cond;
	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);

	/* Note: release this mutex before taking the victim mutex, to avoid deadlock */
	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);

	/* we need to steal someone's job */
	unsigned victim = select_victim(sched_ctx_id);

	starpu_pthread_mutex_t *victim_sched_mutex;
	starpu_pthread_cond_t *victim_sched_cond;

	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);
	struct _starpu_deque_jobq *victimq = ws->queue_array[victim];

	task = _starpu_deque_pop_task(victimq, workerid);
	if (task)
	{
		_STARPU_TRACE_WORK_STEALING(q, workerid);
		ws->performed_total++;

		/* Beware : we have to increase the number of processed tasks of
		 * the stealer, not the victim ! */
		q->nprocessed++;
		victimq->njobs--;
	}

	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);

	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);
	if(!task)
	{
		task = _starpu_deque_pop_task(q, workerid);
		if (task)
		{
			/* there was a local task */
			ws->performed_total++;
			q->nprocessed++;
			q->njobs--;
			return task;
		}
	}

	return task;
}
Example #29

#include <starpu.h>
#include <starpu_opencl.h>
#include <CL/cl.h>

extern struct starpu_opencl_program opencl_code;

void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
{
        unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incA", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=100;
		size_t local=100;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}