Пример #1
0
void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	while (!async_channel->event.finished)
		STARPU_PTHREAD_COND_WAIT(&async_channel->event.cond, &async_channel->event.mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
	cudaError_t cures;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = (*async_channel).event.cuda_event;

		cures = cudaEventSynchronize(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		cures = cudaEventDestroy(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int err;
		if ((*async_channel).event.opencl_event == NULL)
			STARPU_ABORT();
		err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		err = clReleaseEvent((*async_channel).event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
	      break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_MAIN_RAM:
		starpu_disk_wait_request(async_channel);
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}
#endif /* !SIMGRID */
}
Пример #2
0
void *_starpu_htbl_search_32(struct starpu_htbl32_node_s *htbl, uint32_t key)
{
	unsigned currentbit;
	unsigned keysize = 32;

	starpu_htbl32_node_t *current_htbl = htbl;

	/* 000000000001111 with HTBL_NODE_SIZE 1's */
	uint32_t mask = (1<<STARPU_HTBL32_NODE_SIZE)-1;

	for(currentbit = 0; currentbit < keysize; currentbit+=STARPU_HTBL32_NODE_SIZE)
	{
	
	//	printf("search : current bit = %d \n", currentbit);
		if (STARPU_UNLIKELY(current_htbl == NULL))
			return NULL;

		/* 0000000000001111 
		 *     | currentbit
		 * 0000111100000000 = offloaded_mask
		 *         |last_currentbit
		 * */

		unsigned last_currentbit = 
			keysize - (currentbit + STARPU_HTBL32_NODE_SIZE);
		uint32_t offloaded_mask = mask << last_currentbit;
		unsigned current_index = 
			(key & (offloaded_mask)) >> (last_currentbit);

		current_htbl = current_htbl->children[current_index];
	}

	return current_htbl;
}
Пример #3
0
/* the generic interface that call the proper underlying implementation */
int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
{
	struct starpu_task *task = j->task;

	task->status = STARPU_TASK_READY;

	/* in case there is no codelet associated to the task (that's a control
	 * task), we directly execute its callback and enforce the
	 * corresponding dependencies */
	if (task->cl == NULL)
	{
		_starpu_handle_job_termination(j, job_is_already_locked);
		return 0;
	}

	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
	{
		unsigned workerid = task->workerid;
		struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
		
		if (use_prefetch)
		{
			uint32_t memory_node = starpu_worker_get_memory_node(workerid); 
			_starpu_prefetch_task_input_on_node(task, memory_node);
		}

		return _starpu_push_local_task(worker, j);
	}
	else {
		STARPU_ASSERT(policy.push_task);

		return policy.push_task(task);
	}
}
Пример #4
0
unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = (*async_channel).event.cuda_event;
		cudaError_t cures = cudaEventQuery(event);

		success = (cures == cudaSuccess);
		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = (*async_channel).event.opencl_event;
		if (opencl_event == NULL) STARPU_ABORT();
		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
Пример #5
0
double _starpu_history_based_job_expected_length(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j)
{
    double exp;
    struct starpu_per_arch_perfmodel_t *per_arch_model;
    struct starpu_history_entry_t *entry;
    struct starpu_htbl32_node_s *history;

    load_history_based_model(model, 1);

    if (STARPU_UNLIKELY(!j->footprint_is_computed))
        _starpu_compute_buffers_footprint(j);

    uint32_t key = j->footprint;

    per_arch_model = &model->per_arch[arch];

    history = per_arch_model->history;
    if (!history)
        return -1.0;

    PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
    entry = _starpu_htbl_search_32(history, key);
    PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);

    exp = entry?entry->mean:-1.0;

    return exp;
}
int starpu_opencl_set_kernel_args(cl_int *error, cl_kernel *kernel, ...)
{
	int i;
	va_list ap;

	va_start(ap, kernel);

	for (i = 0; ; i++)
	{
		int size = va_arg(ap, int);
		if (size == 0)
			break;

		cl_mem *ptr = va_arg(ap, cl_mem *);
		int err = clSetKernelArg(*kernel, i, size, ptr);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
		{
			*error = err;
			break;
		}
	}

	va_end(ap);
	return i;
}
cl_int starpu_opencl_release_kernel(cl_kernel kernel)
{
	cl_int err;

	err = clReleaseKernel(kernel);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	return CL_SUCCESS;
}
Пример #8
0
int starpu_event_wait_all(int num_events, starpu_event *events) {
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

   int i;
   for (i=0; i<num_events; i++)
      starpu_event_wait(events[i]);

   return 0;
}
Пример #9
0
int _starpu_opencl_init_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_INIT(&task_mutex[devid][j], NULL);
		STARPU_PTHREAD_COND_INIT(&task_cond[devid][j], NULL);
	}
#else /* !STARPU_SIMGRID */
	cl_int err;
	cl_uint uint;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

        _STARPU_DEBUG("Initialising context for dev %d\n", devid);

        // Create a compute context
	err = 0;
        contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	starpu_malloc_set_align(uint/8);

        // Create execution queue for the given device
        queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        // Create transfer queue for the given device
        cl_command_queue_properties props;
        err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
        props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
        in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif /* !STARPU_SIMGRID */
	return 0;
}
Пример #10
0
unsigned _starpu_memory_node_get_local_key(void)
{
	unsigned *memory_node;
	memory_node = (unsigned *) STARPU_PTHREAD_GETSPECIFIC(memory_node_key);

	/* in case this is called by the programmer, we assume the RAM node
	   is the appropriate memory node ... XXX */
	if (STARPU_UNLIKELY(!memory_node))
		return STARPU_MAIN_RAM;

	return *memory_node;
}
Пример #11
0
static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
	char name[30];

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
#elif defined(STARPU_USE_OPENCL)
	/* Request the size of the current device's memory */
	cl_int err;
	cl_ulong size;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	totalGlobalMem = size;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
	if (limit == -1)
	{
		sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default.  */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_OPENCL
	/* How much memory to waste ? */
	to_waste = totalGlobalMem - global_mem[devid];
#endif

	_STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
			devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
			(long)(totalGlobalMem - to_waste)/(1024*1024));

}
Пример #12
0
int starpu_event_wait(starpu_event event) {
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

   /* We can avoid mutex locking if event is already complete */
   if (!event->complete) {

      _starpu_event_lock(event);
      
      event->cond_wait_count += 1;

      while (!event->complete) {
         pthread_cond_wait(&event->cond, &event->mutex);
      }

      event->cond_wait_count -= 1;

      _starpu_event_unlock(event);
   }

   return 0;
}
int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	if (!starpu_opencl_worker_get_count())
		return 0;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		if (opencl_programs->programs[dev])
		{
			cl_int err;
			err = clReleaseProgram(opencl_programs->programs[dev]);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);
		}
	}
	return 0;
}
static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
{
/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */

	struct starpu_task *task = create_task(TAG22(k, i, j));

	task->cl = &cl22;

	/* which sub-data is manipulated ? */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);

	if (!noprio && (i == k + 1) && (j == k +1) )
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0)
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
	}
	else
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
	}

	int n = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_SGEMM(n, n, n);

	int ret = starpu_task_submit(task);
        if (STARPU_UNLIKELY(ret == -ENODEV))
	{
                FPRINTF(stderr, "No worker may execute this task\n");
                exit(0);
        }
}
static
int _starpu_opencl_get_binary_name(char *binary_file_name, size_t maxlen, const char *source_file_name, int dev, cl_device_id device)
{
	char binary_directory[1024];
	char *p;
	cl_int err;
	cl_uint vendor_id;

	_starpu_opencl_create_binary_directory(binary_directory, 1024);

	p = strrchr(source_file_name, '/');
	snprintf(binary_file_name, maxlen, "%s/%s", binary_directory, p?p:source_file_name);

	p = strstr(binary_file_name, ".cl");
	if (p == NULL) p=binary_file_name + strlen(binary_file_name);

	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	sprintf(p, ".%s.vendor_id_%d_device_id_%d", _starpu_opencl_get_device_type_as_string(dev), (int)vendor_id, dev);

	return CL_SUCCESS;
}
cl_int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs,
                                 const char *kernel_name, int devid)
{
	cl_int err;
	cl_device_id device;
	cl_program program;

	starpu_opencl_get_device(devid, &device);
	starpu_opencl_get_queue(devid, queue);

	program = opencl_programs->programs[devid];
	if (!program)
	{
		_STARPU_DISP("Program not available for device <%d>\n", devid);
		return CL_INVALID_PROGRAM;
	}

	// Create the compute kernel in the program we wish to run
	*kernel = clCreateKernel(program, kernel_name, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	return CL_SUCCESS;
}
Пример #17
0
int _starpu_opencl_deinit_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		task_finished[devid][j] = 0;
		STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[devid][j]);
		STARPU_PTHREAD_COND_DESTROY(&task_cond[devid][j]);
	}
#else /* !STARPU_SIMGRID */
        cl_int err;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

        _STARPU_DEBUG("De-initialising context for dev %d\n", devid);

        err = clReleaseContext(contexts[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(in_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        err = clReleaseCommandQueue(out_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
        err = clReleaseCommandQueue(peer_transfer_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        err = clReleaseCommandQueue(alloc_queues[devid]);
        if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

        contexts[devid] = NULL;

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif

        return 0;
}
int _starpu_push_task_to_workers(struct starpu_task *task)
{
	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
	unsigned nworkers = 0;

	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);

	/* if the contexts still does not have workers put the task back to its place in
	   the empty ctx list */
	if(!sched_ctx->is_initial_sched)
	{
		/*if there are workers in the ctx that are not able to execute tasks
		  we consider the ctx empty */
		nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx);

		if (nworkers == 0)
		{
			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
#ifdef STARPU_USE_SC_HYPERVISOR
			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL 
			   && sched_ctx->perf_counters->notify_empty_ctx)
			{
				_STARPU_TRACE_HYPERVISOR_BEGIN();
				sched_ctx->perf_counters->notify_empty_ctx(sched_ctx->id, task);
				_STARPU_TRACE_HYPERVISOR_END();
			}
#endif

			return -EAGAIN;
		}
	}

	_starpu_profiling_set_task_push_start_time(task);

	int ret = 0;
	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
	{
		unsigned node = starpu_worker_get_memory_node(task->workerid);
		if (starpu_get_prefetch_flag())
			starpu_prefetch_task_input_on_node(task, node);

		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
	}
	else
	{
		struct _starpu_machine_config *config = _starpu_get_machine_config();

		/* When a task can only be executed on a given arch and we have
		 * only one memory node for that arch, we can systematically
		 * prefetch before the scheduling decision. */
		if (starpu_get_prefetch_flag())
		{
			if (task->cl->where == STARPU_CPU && config->cpus_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
			else if (task->cl->where == STARPU_CUDA && config->cuda_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
			else if (task->cl->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
			else if (task->cl->where == STARPU_MIC && config->mic_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
			else if (task->cl->where == STARPU_SCC && config->scc_nodeid >= 0)
				starpu_prefetch_task_input_on_node(task, config->scc_nodeid);
		}

		if(!sched_ctx->sched_policy)
		{
			/* Note: we have to call that early, or else the task may have
			 * disappeared already */
			starpu_push_task_end(task);
			if(!sched_ctx->awake_workers)
				ret = _starpu_push_task_on_specific_worker(task, sched_ctx->main_master);
			else
			{
				struct starpu_worker_collection *workers = sched_ctx->workers;
				
				struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
				job->task_size = workers->nworkers;
				job->combined_workerid = -1; // workerid; its a ctx not combined worker
				job->active_task_alias_count = 0;

				STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, workers->nworkers);
				STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, workers->nworkers);
				job->after_work_busy_barrier = workers->nworkers;

				unsigned workerid;
				struct starpu_sched_ctx_iterator it;
				if(workers->init_iterator)
					workers->init_iterator(workers, &it);

				while(workers->has_next(workers, &it))
				{
					workerid = workers->get_next(workers, &it);
					struct starpu_task *alias = starpu_task_dup(task);
					alias->destroy = 1;
					ret |= _starpu_push_task_on_specific_worker(alias, workerid);
				}
			}
		}
		else
		{
			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
			/* check out if there are any workers in the context */
			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
			if (nworkers == 0)
				ret = -1;
			else
			{
				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
				ret = sched_ctx->sched_policy->push_task(task);
				_STARPU_TRACE_WORKER_SCHEDULING_POP;
			}
			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
		}

		if(ret == -1)
		{
			fprintf(stderr, "repush task \n");
			_STARPU_TRACE_JOB_POP(task, task->priority > 0);
			ret = _starpu_push_task_to_workers(task);
		}
	}
	/* Note: from here, the task might have been destroyed already! */
	_STARPU_LOG_OUT();
	return ret;

}
Пример #19
0
static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
{
	int ret;

	/* create a new codelet */
	struct starpu_task *entry_task = NULL;

	/* create all the DAG nodes */
	unsigned i,j,k;

	starpu_data_handle_t dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));

	starpu_data_set_sequential_consistency_flag(dataA, 0);

	struct starpu_data_filter f =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	for (k = 0; k < nbigblocks; k++)
	{
		struct starpu_task *task = create_task_11(dataA, k, reclevel);
		/* we defer the launch of the first task */
		if (k == 0)
		{
			entry_task = task;
		}
		else
		{
			ret = starpu_task_submit(task);
			if (ret == -ENODEV) return 77;
			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
		}

		for (j = k+1; j<nblocks; j++)
		{
		     	ret = create_task_21(dataA, k, j, reclevel);
			if (ret == -ENODEV) return 77;

			for (i = k+1; i<nblocks; i++)
			{
				if (i <= j)
				{
				     ret = create_task_22(dataA, k, i, j, reclevel);
				     if (ret == -ENODEV) return 77;
				}
			}
		}
	}

	/* schedule the codelet */
	ret = starpu_task_submit(entry_task);
	if (STARPU_UNLIKELY(ret == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		return 77;
	}

	if (nblocks == nbigblocks)
	{
		/* stall the application until the end of computations */
		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
		starpu_data_unregister(dataA);
		return 0;
	}
	else
	{
		STARPU_ASSERT(reclevel == 0);
		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);

		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
		STARPU_ASSERT(tag_array);

		unsigned ind = 0;
		for (i = nbigblocks; i < nblocks; i++)
		for (j = nbigblocks; j < nblocks; j++)
		{
			if (i <= j)
				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
		}

		starpu_tag_wait_array(ind, tag_array);

		free(tag_array);

		starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
		starpu_data_unregister(dataA);

		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];

		return cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
	}
}
static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
{
	double start;
	double end;

	struct starpu_task *entry_task = NULL;

	/* create all the DAG nodes */
	unsigned i,j,k;

	start = starpu_timing_now();

	for (k = 0; k < nblocks; k++)
	{
		struct starpu_task *task = create_task_11(dataA, k);
		/* we defer the launch of the first task */
		if (k == 0)
		{
			entry_task = task;
		}
		else
		{
			int ret = starpu_task_submit(task);
                        if (STARPU_UNLIKELY(ret == -ENODEV))
			{
                                FPRINTF(stderr, "No worker may execute this task\n");
                                exit(0);
                        }

		}

		for (j = k+1; j<nblocks; j++)
		{
			create_task_21(dataA, k, j);

			for (i = k+1; i<nblocks; i++)
			{
				if (i <= j)
					create_task_22(dataA, k, i, j);
			}
		}
	}

	/* schedule the codelet */
	int ret = starpu_task_submit(entry_task);
        if (STARPU_UNLIKELY(ret == -ENODEV))
	{
                FPRINTF(stderr, "No worker may execute this task\n");
                exit(0);
        }


	/* stall the application until the end of computations */
	starpu_tag_wait(TAG11(nblocks-1));

	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);

	end = starpu_timing_now();


	double timing = end - start;

	unsigned n = starpu_matrix_get_nx(dataA);

	double flop = (1.0f*n*n*n)/3.0f;

	PRINTF("# size\tms\tGFlops\n");
	PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f));
}
Пример #21
0
int main(int argc, char **argv)
{
	unsigned i;
        float foo;
	starpu_data_handle float_array_handle;
	starpu_codelet cl;

	starpu_init(NULL);
        if (argc == 2) niter = atoi(argv[1]);
        foo = 0.0f;

	starpu_variable_data_register(&float_array_handle, 0 /* home node */,
                                      (uintptr_t)&foo, sizeof(float));

#ifdef STARPU_USE_OPENCL
        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_codelet.cl", &opencl_code);
#endif

	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
        cl.cpu_func = cpu_codelet;
#ifdef STARPU_USE_CUDA
        cl.cuda_func = cuda_codelet;
#endif
#ifdef STARPU_USE_OPENCL
        cl.opencl_func = opencl_codelet;
#endif
        cl.nbuffers = 1;
        cl.model = NULL;

	for (i = 0; i < niter; i++)
	{
		struct starpu_task *task = starpu_task_create();
                int ret;

		task->cl = &cl;

		task->callback_func = NULL;

		task->buffers[0].handle = float_array_handle;
		task->buffers[0].mode = STARPU_RW;

		ret = starpu_task_submit(task, NULL);
		if (STARPU_UNLIKELY(ret == -ENODEV))
		{
			fprintf(stderr, "No worker may execute this task\n");
			exit(0);
		}
	}

	starpu_task_wait_for_all();

	/* update the array in RAM */
	starpu_data_acquire(float_array_handle, STARPU_R);

	fprintf(stderr, "variable -> %f\n", foo);

	starpu_data_release(float_array_handle);

	starpu_shutdown();

	return 0;
}
Пример #22
0
int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.node = node,
		.async = async,
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _prefetch_data_on_node, &wrapper))
	{
		/* we can immediately proceed */
		_starpu_fetch_data_on_node(handle, node, mode, async, NULL, NULL);

		/* remove the "lock"/reference */
		if (!async)
		{
			_starpu_spin_lock(&handle->header_lock);
			_starpu_notify_data_dependencies(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
	}
	else {
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	return 0;
}

int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async)
{
	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R);
}

/*
 *	It is possible to specify that a piece of data can be discarded without
 *	impacting the application.
 */
void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important)
{
	_starpu_spin_lock(&handle->header_lock);

	/* first take all the children lock (in order !) */
	unsigned child;
	for (child = 0; child < handle->nchildren; child++)
	{
		/* make sure the intermediate children is advised as well */
		struct starpu_data_state_t *child_handle = &handle->children[child];
		if (child_handle->nchildren > 0)
			starpu_data_advise_as_important(child_handle, is_important);
	}

	handle->is_not_important = !is_important;

	/* now the parent may be used again so we release the lock */
	_starpu_spin_unlock(&handle->header_lock);

}
Пример #23
0
/* The data must be released by calling starpu_data_release later on */
int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode)
{
	STARPU_ASSERT(handle);

	/* it is forbidden to call this function from a callback or a codelet */
	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
		return -EDEADLK;

	struct user_interaction_wrapper wrapper =
	{
		.handle = handle,
		.mode = mode,
		.node = 0, // unused
		.cond = PTHREAD_COND_INITIALIZER,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.finished = 0
	};

//	_STARPU_DEBUG("TAKE sequential_consistency_mutex starpu_data_acquire\n");
	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	int sequential_consistency = handle->sequential_consistency;
	if (sequential_consistency)
	{
		wrapper.pre_sync_task = starpu_task_create();

		wrapper.post_sync_task = starpu_task_create();

#ifdef STARPU_USE_FXT
                starpu_job_t job = _starpu_get_job_associated_to_task(wrapper.pre_sync_task);
                job->model_name = "acquire_pre";
                job = _starpu_get_job_associated_to_task(wrapper.post_sync_task);
                job->model_name = "acquire_post";
#endif

		_starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

		/* TODO detect if this is superflous */
		wrapper.pre_sync_task->synchronous = 1;
		int ret = starpu_task_submit(wrapper.pre_sync_task, NULL);
		STARPU_ASSERT(!ret);
		/* starpu_event event;
         int ret = starpu_task_submit(wrapper.pre_sync_task, &event);
		   STARPU_ASSERT(!ret);
         starpu_event_wait(event);
         starpu_event_release(event);
      */
	}
	else {
		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
	}

	/* we try to get the data, if we do not succeed immediately, we set a
 	* callback function that will be executed automatically when the data is
 	* available again, otherwise we fetch the data directly */
	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode,
			_starpu_data_acquire_continuation, &wrapper))
	{
		/* no one has locked this data yet, so we proceed immediately */
		int ret = _starpu_fetch_data_on_node(handle, 0, mode, 0, NULL, NULL);
		STARPU_ASSERT(!ret);
	}
	else {
		PTHREAD_MUTEX_LOCK(&wrapper.lock);
		while (!wrapper.finished)
			PTHREAD_COND_WAIT(&wrapper.cond, &wrapper.lock);
		PTHREAD_MUTEX_UNLOCK(&wrapper.lock);
	}

	/* At that moment, the caller holds a reference to the piece of data.
	 * We enqueue the "post" sync task in the list associated to the handle
	 * so that it is submitted by the starpu_data_release
	 * function. */
	_starpu_add_post_sync_tasks(wrapper.post_sync_task, handle);

	return 0;
}

/* This function must be called after starpu_data_acquire so that the
 * application release the data */
void starpu_data_release(starpu_data_handle handle)
{
	STARPU_ASSERT(handle);

	/* The application can now release the rw-lock */
	_starpu_release_data_on_node(handle, 0, 0);

	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
	_starpu_unlock_post_sync_tasks(handle);
}
Пример #24
0
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> CUBLAS_RAM */
		/* only the proper CUBLAS thread can initiate this ! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
		/* SCC RAM associated to the master process is considered as
		 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);

		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void * ptr = NULL;
			starpu_ssize_t size = 0;
			handle->ops->pack_data(handle, src_node, &ptr, &size);
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, &req->async_channel);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
		
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
		if(copy_methods->any_to_any) 
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void * ptr = NULL;
			size_t size = 0;
			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, &req->async_channel);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size); 
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):	
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
		
	default:
		STARPU_ABORT();
		break;
	}
	
	return ret;
#endif /* !SIMGRID */
}
static
int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program_source, const char* build_options,
						      struct starpu_opencl_program *opencl_programs, const char* source_file_name)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		cl_device_id device;
		cl_context   context;
		cl_program   program;
		cl_int       err;

		if (opencl_programs)
			opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Create the compute program from the source buffer
		program = clCreateProgramWithSource(context, 1, (const char **) &opencl_program_source, NULL, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program source with options %s!\n", build_options);
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		if (opencl_programs)
			opencl_programs->programs[dev] = program;
		else
		{
			char binary_file_name[1024];
			char *binary;
			size_t binary_len;
			FILE *fh;

			err = _starpu_opencl_get_binary_name(binary_file_name, 1024, source_file_name, dev, device);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_len, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			binary = malloc(binary_len);

			err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binary), &binary, NULL);
			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

			fh = fopen(binary_file_name, "w");
			if (fh == NULL)
			{
				_STARPU_DISP("Error: Failed to open file <%s>\n", binary_file_name);
				perror("fopen");
				return EXIT_FAILURE;
			}
			fwrite(binary, binary_len, 1, fh);
			fclose(fh);
			free(binary);
			_STARPU_DEBUG("File <%s> created\n", binary_file_name);
		}
	}
	return EXIT_SUCCESS;
}
int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs)
{
	unsigned int dev;
	unsigned int nb_devices;

	nb_devices = _starpu_opencl_get_device_count();
	// Iterate over each device
	for(dev = 0; dev < nb_devices; dev ++)
	{
		cl_device_id device;
		cl_context   context;
		cl_program   program;
		cl_int       err;
		char        *binary;
		char         binary_file_name[1024];
		size_t       length;
		cl_int       binary_status;

		opencl_programs->programs[dev] = NULL;

		starpu_opencl_get_device(dev, &device);
		starpu_opencl_get_context(dev, &context);
		if (context == NULL)
		{
			_STARPU_DEBUG("[%u] is not a valid OpenCL context\n", dev);
			continue;
		}

		// Load the binary buffer
		err = _starpu_opencl_get_binary_name(binary_file_name, 1024, kernel_id, dev, device);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
		binary = _starpu_opencl_load_program_binary(binary_file_name, &length);

		// Create the compute program from the binary buffer
		program = clCreateProgramWithBinary(context, 1, &device, &length, (const unsigned char **) &binary, &binary_status, &err);
		if (!program || err != CL_SUCCESS)
		{
			_STARPU_DISP("Error: Failed to load program binary!\n");
			return EXIT_FAILURE;
		}

		// Build the program executable
		err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

		// Get the status
		{
			cl_build_status status;
			size_t len;
			static char buffer[4096] = "";

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
			if (len > 2)
				_STARPU_DISP("Compilation output\n%s\n", buffer);

			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
			if (err != CL_SUCCESS || status != CL_BUILD_SUCCESS)
			{
				_STARPU_DISP("Error: Failed to build program executable!\n");
				_STARPU_DISP("clBuildProgram: %d - clGetProgramBuildInfo: %d\n", err, status);
				return EXIT_FAILURE;
			}
		}

		// Store program
		opencl_programs->programs[dev] = program;
	}
	return 0;
}
Пример #27
0
int main(int argc, char **argv)
{
	int ret;
	unsigned part;
	double timing;
	double start, end;
	unsigned row, pos;
	unsigned ind;

	/* CSR matrix description */
	float *nzval;
	uint32_t nnz;
	uint32_t *colind;
	uint32_t *rowptr;
	
	/* Input and Output vectors */
	float *vector_in_ptr;
	float *vector_out_ptr;

	/*
	 *	Parse command-line arguments
	 */
	parse_args(argc, argv);

	/*
	 *	Launch StarPU
	 */
	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/*
	 *	Create a 3-band sparse matrix as input example
	 */
	nnz = 3*size-2;
	starpu_malloc((void **)&nzval, nnz*sizeof(float));
	starpu_malloc((void **)&colind, nnz*sizeof(uint32_t));
	starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t));
	assert(nzval && colind && rowptr);

	/* fill the matrix */
	for (row = 0, pos = 0; row < size; row++)
	{
		rowptr[row] = pos;

		if (row > 0)
		{
			nzval[pos] = 1.0f;
			colind[pos] = row-1;
			pos++;
		}
		
		nzval[pos] = 5.0f;
		colind[pos] = row;
		pos++;

		if (row < size - 1)
		{
			nzval[pos] = 1.0f;
			colind[pos] = row+1;
			pos++;
		}
	}

	STARPU_ASSERT(pos == nnz);

	rowptr[size] = nnz;
	
	/* initiate the 2 vectors */
	starpu_malloc((void **)&vector_in_ptr, size*sizeof(float));
	starpu_malloc((void **)&vector_out_ptr, size*sizeof(float));
	assert(vector_in_ptr && vector_out_ptr);

	/* fill them */
	for (ind = 0; ind < size; ind++)
	{
		vector_in_ptr[ind] = 2.0f;
		vector_out_ptr[ind] = 0.0f;
	}

	/*
	 *	Register the CSR matrix and the 2 vectors
	 */
	starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
	starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float));
	starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float));

	/*
	 *	Partition the CSR matrix and the output vector
	 */
	csr_f.nchildren = nblocks;
	vector_f.nchildren = nblocks;
	starpu_data_partition(sparse_matrix, &csr_f);
	starpu_data_partition(vector_out, &vector_f);

	/*
	 *	If we use OpenCL, we need to compile the SpMV kernel
	 */
#ifdef STARPU_USE_OPENCL
	compile_spmv_opencl_kernel();
#endif

	start = starpu_timing_now();

	/*
	 *	Create and submit StarPU tasks
	 */
	for (part = 0; part < nblocks; part++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &spmv_cl;
	
		task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part);
		task->handles[1] = vector_in;
		task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part);
	
		ret = starpu_task_submit(task);
		if (STARPU_UNLIKELY(ret == -ENODEV))
		{
			FPRINTF(stderr, "No worker may execute this task\n");
			exit(0);
		}
	}

	starpu_task_wait_for_all();
	end = starpu_timing_now();

	/*
	 *	Unregister the CSR matrix and the output vector
	 */
	starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM);
	starpu_data_unpartition(vector_out, STARPU_MAIN_RAM);

	/*
	 *	Unregister data
	 */
	starpu_data_unregister(sparse_matrix);
	starpu_data_unregister(vector_in);
	starpu_data_unregister(vector_out);

	/*
	 *	Display the result
	 */
	for (row = 0; row < STARPU_MIN(size, 16); row++)
	{
                FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
	}

	starpu_free(nzval);
	starpu_free(colind);
	starpu_free(rowptr);
	starpu_free(vector_in_ptr);
	starpu_free(vector_out_ptr);

	/*
	 *	Stop StarPU
	 */
	starpu_shutdown();

	timing = end - start;
	FPRINTF(stderr, "Computation took (in ms)\n");
	FPRINTF(stdout, "%2.2f\n", timing/1000);

	return 0;
}