Example #1
unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	unsigned ret;
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	ret = async_channel->event.finished;
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
	return ret;
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
	unsigned success = 0;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = async_channel->event.cuda_event;
		cudaError_t cures = cudaEventQuery(event);

		success = (cures == cudaSuccess);
		if (success)
			cudaEventDestroy(event);
		else if (cures != cudaErrorNotReady)
			STARPU_CUDA_REPORT_ERROR(cures);
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int event_status;
		cl_event opencl_event = async_channel->event.opencl_event;
		if (opencl_event == NULL)
			STARPU_ABORT();
		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		if (event_status < 0)
			STARPU_OPENCL_REPORT_ERROR(event_status);
		success = (event_status == CL_COMPLETE);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		success = starpu_disk_test_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}

	return success;
#endif /* !SIMGRID */
}
Example #2
void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
{
#ifdef STARPU_SIMGRID
	STARPU_PTHREAD_MUTEX_LOCK(&async_channel->event.mutex);
	while (!async_channel->event.finished)
		STARPU_PTHREAD_COND_WAIT(&async_channel->event.cond, &async_channel->event.mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&async_channel->event.mutex);
#else /* !SIMGRID */
	enum starpu_node_kind kind = async_channel->type;
#ifdef STARPU_USE_CUDA
	cudaEvent_t event;
	cudaError_t cures;
#endif

	switch (kind)
	{
#ifdef STARPU_USE_CUDA
	case STARPU_CUDA_RAM:
		event = async_channel->event.cuda_event;

		cures = cudaEventSynchronize(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		cures = cudaEventDestroy(event);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		break;
#endif
#ifdef STARPU_USE_OPENCL
	case STARPU_OPENCL_RAM:
	{
		cl_int err;
		if (async_channel->event.opencl_event == NULL)
			STARPU_ABORT();
		err = clWaitForEvents(1, &async_channel->event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		err = clReleaseEvent(async_channel->event.opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS))
			STARPU_OPENCL_REPORT_ERROR(err);
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
		break;
#endif
	case STARPU_DISK_RAM:
		starpu_disk_wait_request(async_channel);
		break;
	case STARPU_CPU_RAM:
	default:
		STARPU_ABORT();
	}
#endif /* !SIMGRID */
}
Example #3
void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen)
{
    if (arch == STARPU_CPU_DEFAULT)
    {
        snprintf(archname, maxlen, "cpu");
    }
    else if ((STARPU_CUDA_DEFAULT <= arch)
             && (arch < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS))
    {
        int devid = arch - STARPU_CUDA_DEFAULT;
        snprintf(archname, maxlen, "cuda_%d", devid);
    }
    else if ((STARPU_OPENCL_DEFAULT <= arch)
             && (arch < STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS))
    {
        int devid = arch - STARPU_OPENCL_DEFAULT;
        snprintf(archname, maxlen, "opencl_%d", devid);
    }
    else if (arch == STARPU_GORDON_DEFAULT)
    {
        snprintf(archname, maxlen, "gordon");
    }
    else
    {
        STARPU_ABORT();
    }
}
Example #4
static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	/* printf("22\n"); */
	float *block 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *in 	= (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *out 	= (float *)STARPU_VECTOR_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);

	switch (s)
	{
		case 0:
			cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #5
int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
{
	/* TODO */
	STARPU_ABORT();

	return 0;
}
Example #6
/* R(z) = R(z+d) = local, just call the save kernel */
static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
{
	struct starpu_task *save_task = starpu_task_create();
	struct block_description *descr = get_block_description(z);

	save_task->cl = (dir == -1)?&save_cl_bottom:&save_cl_top;
	save_task->cl_arg = descr;

	/* Saving our border... */
	save_task->handles[0] = descr->layers_handle[0];
	save_task->handles[1] = descr->layers_handle[1];

	/* ... to the neighbour's copy */
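	/* dir is -1 or +1: boundary_blocks[(1+dir)/2] is the neighbour below
	 * (dir == -1) or above (dir == +1), and (1-dir)/2 picks that
	 * neighbour's boundary buffer facing back towards this block */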
	struct block_description *neighbour = descr->boundary_blocks[(1+dir)/2];
	save_task->handles[2] = neighbour->boundaries_handle[(1-dir)/2][0];
	save_task->handles[3] = neighbour->boundaries_handle[(1-dir)/2][1];

	/* Bind */
	if (iter <= BIND_LAST)
		save_task->execute_on_a_specific_worker = get_bind_tasks();
	save_task->workerid = descr->preferred_worker;

	int ret = starpu_task_submit(save_task);
	if (ret)
	{
		FPRINTF(stderr, "Could not submit task save: %d\n", ret);
		STARPU_ABORT();
	}
}
Example #7
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *sub11;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 

	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

	unsigned long z;

	switch (s)
	{
		case 0:
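			/* right-looking in-place LU step on the diagonal block:
			 * divide by the pivot, then apply a rank-1 update (SGER)
			 * to the trailing submatrix */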
			for (z = 0; z < nx; z++)
			{
				float pivot;
				pivot = sub11[z+z*ld];
				STARPU_ASSERT(pivot != 0.0f);
		
				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
		
				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
						&sub11[z+(z+1)*ld], ld,
						&sub11[(z+1)+z*ld], 1,
						&sub11[(z+1) + (z+1)*ld],ld);
			}
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
				cudaStreamSynchronize(starpu_cuda_get_local_stream());

				STARPU_ASSERT(pivot != 0.0f);
				
				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
				
				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
								&sub11[z+(z+1)*ld], ld,
								&sub11[(z+1)+z*ld], 1,
								&sub11[(z+1) + (z+1)*ld],ld);
			}

			cudaStreamSynchronize(starpu_cuda_get_local_stream());

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #8
int _starpu_mkpath(const char *s, mode_t mode)
{
	char *q, *r = NULL, *path = NULL, *up = NULL;
	int rv;

	rv = -1;
	if (strcmp(s, ".") == 0 || strcmp(s, "/") == 0
#ifdef __MINGW32__
		/* C:/ or C:\ */
		|| (s[0] && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && !s[3])
#endif
		)
		return 0;

	if ((path = strdup(s)) == NULL)
		STARPU_ABORT();

	if ((q = strdup(s)) == NULL)
		STARPU_ABORT();

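	/* POSIX dirname() may modify its argument in place, hence the strdup'd copy in q */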
	if ((r = dirname(q)) == NULL)
		goto out;

	if ((up = strdup(r)) == NULL)
		STARPU_ABORT();

	if ((_starpu_mkpath(up, mode) == -1) && (errno != EEXIST))
		goto out;

	if ((mkdir(path, mode) == -1) && (errno != EEXIST))
		rv = -1;
	else 
		rv = 0;
	
out:
	if (up)
		free(up);

	free(q);
	free(path);
	return rv;
}
Example #9
static starpu_mic_kernel_t mic_cpy_func(void)
{
#ifdef STARPU_USE_MIC
	static starpu_mic_func_symbol_t mic_symbol = NULL;
	if (mic_symbol == NULL)
		starpu_mic_register_kernel(&mic_symbol, "mp_cpy_kernel");

	return starpu_mic_get_kernel(mic_symbol);
#else
	STARPU_ABORT();
	return NULL;
#endif
}
Example #10
static void common_data_cpy_func(void *descr[], void *cl_arg)
{
	unsigned interface_id = *(unsigned *)cl_arg;

	const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id);
	const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods;

	int workerid = starpu_worker_get_id();
	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
	unsigned memory_node = starpu_worker_get_memory_node(workerid);

	void *dst_interface = descr[0];
	void *src_interface = descr[1];

	switch (type)
	{
		case STARPU_CPU_WORKER:
			if (copy_methods->ram_to_ram)
			{
				copy_methods->ram_to_ram(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		case STARPU_CUDA_WORKER:
			if (copy_methods->cuda_to_cuda)
			{
				copy_methods->cuda_to_cuda(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		case STARPU_OPENCL_WORKER:
			if (copy_methods->opencl_to_opencl)
			{
				copy_methods->opencl_to_opencl(src_interface, memory_node, dst_interface, memory_node);
				return;
			}
			break;
		default:
			/* unknown architecture */
			STARPU_ABORT();
	}
	STARPU_ASSERT(copy_methods->any_to_any);
	copy_methods->any_to_any(src_interface, memory_node, dst_interface, memory_node, NULL);

}
Example #11
void mp_cpy_kernel(void *descr[], void *cl_arg)
{
	unsigned interface_id = *(unsigned *)cl_arg;

	const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id);
	const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods;
	
	void *dst_interface = descr[0];
	void *src_interface = descr[1];

	if(copy_methods->ram_to_ram)
		copy_methods->ram_to_ram(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM);
	else if(copy_methods->any_to_any)
		copy_methods->any_to_any(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM, NULL);
	else
		STARPU_ABORT();

}
Example #12
void create_task_update(unsigned iter, unsigned z, int local_rank)
{
	STARPU_ASSERT(iter != 0);

	struct starpu_task *task = starpu_task_create();

	unsigned niter = get_niter();

	/* We are going to synchronize with the last tasks */
	if (iter == niter)
	{
		task->use_tag = 1;
		task->tag_id = TAG_FINISH(z);
	}

	unsigned old_layer = (K*(iter-1)) % 2;
	unsigned new_layer = (old_layer + 1) % 2;

	struct block_description *descr = get_block_description(z);
	task->handles[0] = descr->layers_handle[new_layer];
	task->handles[1] = descr->layers_handle[old_layer];

	task->handles[2] = descr->boundaries_handle[T][new_layer];
	task->handles[3] = descr->boundaries_handle[T][old_layer];

	task->handles[4] = descr->boundaries_handle[B][new_layer];
	task->handles[5] = descr->boundaries_handle[B][old_layer];

	task->cl = &cl_update;
	task->cl_arg = descr;

	if (iter <= BIND_LAST)
		task->execute_on_a_specific_worker = get_bind_tasks();
	task->workerid = descr->preferred_worker;

	int ret = starpu_task_submit(task);
	if (ret)
	{
		FPRINTF(stderr, "Could not submit task update block: %d\n", ret);
		if (ret == -ENODEV)
			exit(77);
		STARPU_ABORT();
	}
}
Example #13
static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s)
	{
		case 0:
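			/* center <- center - left * right (SGEMM with alpha = -1, beta = 1) */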
			STARPU_SGEMM("N", "N", dy, dx, dz,
					-1.0f, left, ld21, right, ld12,
					1.0f, center, ld22);
			break;

#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
					right, ld12, 1.0f, center, ld22);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #14
static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *sub11;
	float *sub12;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);	
	sub12 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
	
#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	/* solve L11 U12 = A12 (find U12) */
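	/* STRSM flags: side=Left, uplo=Lower, no transpose, non-unit diagonal,
	 * i.e. overwrite sub12 with inv(L11) * sub12 */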
	switch (s)
	{
		case 0:
			STARPU_STRSM("L", "L", "N", "N",
					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
					1.0f, sub11, ld11, sub12, ld12);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #15
void create_start_task(int z, int dir)
{
	/* Dumb task depending on the init task and simulating writing the
	   neighbour buffers, to avoid communications and computation running
	   before we start measuring time */
	struct starpu_task *wait_init = starpu_task_create();
	struct block_description *descr = get_block_description(z);
	starpu_tag_t tag_init = TAG_INIT_TASK;
	wait_init->cl = &null;
	wait_init->use_tag = 1;
	wait_init->tag_id = TAG_START(z, dir);
	wait_init->handles[0] = descr->boundaries_handle[(1 + dir) / 2][0];
	wait_init->handles[1] = descr->boundaries_handle[(1 + dir) / 2][1];
	starpu_tag_declare_deps_array(wait_init->tag_id, 1, &tag_init);

	int ret = starpu_task_submit(wait_init);
	if (ret)
	{
		FPRINTF(stderr, "Could not submit task initial wait: %d\n", ret);
		STARPU_ABORT();
	}
}
Example #16
static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *sub11;
	float *sub21;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
	
#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s)
	{
		case 0:
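			/* STRSM flags: side=Right, uplo=Upper, no transpose, unit diagonal,
			 * i.e. overwrite sub21 with sub21 * inv(U11) */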
			STARPU_STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Example #17
/*
 * Schedule saving boundaries of blocks to communication buffers
 */
void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
{
	int node_z = get_block_mpi_node(z);
	int node_z_and_d = get_block_mpi_node(z+dir);

#ifdef STARPU_USE_MPI
	if (node_z == local_rank)
	{
		/* Save data from update */
		create_task_save_local(iter, z, dir, local_rank);
		if (node_z_and_d != local_rank)
		{
			/* R(z) = local & R(z+d) != local, We have to send the data */
			create_task_save_mpi_send(iter, z, dir, local_rank);
		}

	}
	else
	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
		if (node_z_and_d == local_rank)
		{
			create_task_save_mpi_recv(iter, z, dir, local_rank);
		}
		else
		{
			/* R(z) != local & R(z+d) != local We don't have
			   the saved data and don't need it, we shouldn't
			   even have been called! */
			STARPU_ABORT();
		}
	}
#else /* !STARPU_USE_MPI */
	STARPU_ASSERT((node_z == local_rank) && (node_z_and_d == local_rank));
	create_task_save_local(iter, z, dir, local_rank);
#endif /* STARPU_USE_MPI */
}
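A hedged sketch of how this helper is typically driven from the submission loop (the block count nbz, the iteration variable and the bounds are illustrative assumptions, not part of the listing):

	/* sketch: each iteration, save the boundaries of every inner block in
	 * both directions; outermost blocks are skipped here for simplicity */
	unsigned z;
	for (z = 1; z < nbz - 1; z++)
	{
		create_task_save(iter, z, +1, local_rank);
		create_task_save(iter, z, -1, local_rank);
	}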
Example #18
/* Note: in case of a tag, it must be already locked */
void _starpu_notify_cg(struct _starpu_cg *cg)
{
	STARPU_ASSERT(cg);
	unsigned remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);

	if (remaining == 0)
	{
		cg->remaining = cg->ntags;

		struct _starpu_tag *tag;
		struct _starpu_cg_list *tag_successors, *job_successors;
		struct _starpu_job *j;

		/* the group is now completed */
		switch (cg->cg_type)
		{
			case STARPU_CG_APPS:
			{
				/* this is a cg for an application waiting on a set of
				 * tags, wake the thread */
				STARPU_PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
				cg->succ.succ_apps.completed = 1;
				STARPU_PTHREAD_COND_SIGNAL(&cg->succ.succ_apps.cg_cond);
				STARPU_PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
				break;
			}

			case STARPU_CG_TAG:
			{
				tag = cg->succ.tag;
				tag_successors = &tag->tag_successors;

				tag_successors->ndeps_completed++;

				/* Note: the tag is already locked by the
				 * caller. */
				if ((tag->state == STARPU_BLOCKED) &&
					(tag_successors->ndeps == tag_successors->ndeps_completed))
				{
					/* reset the counter so that we can reuse the completion group */
					tag_successors->ndeps_completed = 0;
					_starpu_tag_set_ready(tag);
				}
				break;
			}

			case STARPU_CG_TASK:
			{
				j = cg->succ.job;

				STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);

				job_successors = &j->job_successors;

				unsigned ndeps_completed =
					STARPU_ATOMIC_ADD(&job_successors->ndeps_completed, 1);

				STARPU_ASSERT(job_successors->ndeps >= ndeps_completed);

				/* Need to atomically test submitted and check
				 * dependencies, since this is concurrent with
				 * _starpu_submit_job */
				if (j->submitted && job_successors->ndeps == ndeps_completed &&
					j->task->status == STARPU_TASK_BLOCKED_ON_TASK)
				{
					/* That task has already passed tag checks,
					 * do not do them again since the tag has been cleared! */
					_starpu_enforce_deps_starting_from_task(j);
				}
				else
					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);


				break;
			}

			default:
				STARPU_ABORT();
		}
	}
}
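The core of the function above is an atomic decrement-to-zero pattern. A minimal standalone sketch of that pattern, using C11 atomics instead of the STARPU_ATOMIC_ADD wrapper (all names below are illustrative, not StarPU API):

#include <stdatomic.h>
#include <stdio.h>

struct cg_sketch
{
	atomic_uint remaining; /* dependencies still pending */
	unsigned ntags;        /* initial count, used to rearm the group */
};

/* returns 1 iff this call is the one that completed the group */
static int notify_sketch(struct cg_sketch *cg)
{
	/* fetch_sub returns the previous value: 1 means we just hit zero */
	if (atomic_fetch_sub(&cg->remaining, 1) == 1)
	{
		cg->remaining = cg->ntags; /* rearm so the group can be reused */
		return 1;
	}
	return 0;
}

int main(void)
{
	struct cg_sketch g = { 2, 2 };
	int a = notify_sketch(&g); /* 2 -> 1: group not yet complete */
	int b = notify_sketch(&g); /* 1 -> 0: completes and rearms */
	printf("%d %d\n", a, b);   /* prints "0 1" */
	return 0;
}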
Example #19
/* This should either return 0 if the transfer is complete, or -EAGAIN if the
 * transfer is still pending, and will have to be waited for by
 * _starpu_driver_test_request_completion/_starpu_driver_wait_request_completion
 */
int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
{
	struct _starpu_async_channel *async_channel = async_data;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

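	/* _STARPU_MEMORY_NODE_TUPLE packs (src_kind, dst_kind) into a single
	 * value, so each source/destination pair gets its own switch case */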
	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
		return 0;

#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		return starpu_cuda_copy_async_sync(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size,
				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
				cudaMemcpyDeviceToHost);

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		return starpu_cuda_copy_async_sync(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size,
				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
				cudaMemcpyHostToDevice);

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		return starpu_cuda_copy_async_sync(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size,
				async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
				cudaMemcpyDeviceToDevice);

#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		return starpu_opencl_copy_async_sync(
				src, src_offset, src_node,
				dst, dst_offset, dst_node,
				size,
				&async_channel->event.opencl_event);
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		if (async_data)
			return _starpu_mic_copy_mic_to_ram_async(
					(void*) (src + src_offset), src_node,
					(void*) (dst + dst_offset), dst_node,
					size);
		else
			return _starpu_mic_copy_mic_to_ram(
					(void*) (src + src_offset), src_node,
					(void*) (dst + dst_offset), dst_node,
					size);
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		if (async_data)
			return _starpu_mic_copy_ram_to_mic_async(
					(void*) (src + src_offset), src_node,
					(void*) (dst + dst_offset), dst_node,
					size);
		else
			return _starpu_mic_copy_ram_to_mic(
					(void*) (src + src_offset), src_node,
					(void*) (dst + dst_offset), dst_node,
					size);
#endif
#ifdef STARPU_USE_SCC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		return _starpu_scc_copy_sink_to_src(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size);
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		return _starpu_scc_copy_src_to_sink(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size);
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		return _starpu_scc_copy_sink_to_sink(
				(void*) (src + src_offset), src_node,
				(void*) (dst + dst_offset), dst_node,
				size);
#endif
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
	{
		return _starpu_disk_copy_src_to_disk(
			(void*) (src + src_offset), src_node,
			(void*) dst, dst_offset, dst_node,
			size, async_channel);
	}
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_CPU_RAM):
		return _starpu_disk_copy_disk_to_src(
			(void*) src, src_offset, src_node,
			(void*) (dst + dst_offset), dst_node,
			size, async_channel);

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_DISK_RAM):
		return _starpu_disk_copy_disk_to_disk(
			(void*) src, src_offset, src_node,
			(void*) dst, dst_offset, dst_node,
			size, async_channel);

	default:
		STARPU_ABORT();
		return -1;
	}
	return 0;
}
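A hedged usage sketch of the contract described in the comment above (buffers, nodes, sizes and the channel's prior setup are assumed; only the control flow is the point):

	/* sketch: issue a copy and honour the 0 / -EAGAIN contract */
	struct _starpu_async_channel channel; /* assumed properly initialized */
	int ret = starpu_interface_copy(src, src_offset, src_node,
					dst, dst_offset, dst_node,
					size, &channel);
	if (ret == -EAGAIN)
	{
		/* still pending: poll it to completion, or block instead with
		 * _starpu_driver_wait_request_completion(&channel) */
		while (!_starpu_driver_test_request_completion(&channel))
			; /* other useful work could be overlapped here */
	}
	else
		STARPU_ASSERT(ret == 0); /* transfer already completed */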
Example #20
static void init_problem_data(void)
{
	unsigned i,j;

#ifdef STARPU_USE_CUDA
	if (pin) {
		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float));
	} else
#endif
	{
#ifdef STARPU_HAVE_POSIX_MEMALIGN
		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
#else
		A = malloc(zdim*ydim*sizeof(float));
		B = malloc(xdim*zdim*sizeof(float));
		C = malloc(xdim*ydim*sizeof(float));
#endif
	}

	/* fill the A and B matrices */
	if (norandom) {
		for (j=0; j < ydim; j++) {
			for (i=0; i < zdim; i++) {
				A[j+i*ydim] = (float)(i);
			}
		}
	
		for (j=0; j < zdim; j++) {
			for (i=0; i < xdim; i++) {
				B[j+i*zdim] = (float)(j);
			}
		}
	} 
	else {
#ifdef NORANDOM
		srand(2008);
		STARPU_ABORT();
#endif
		for (j=0; j < ydim; j++) {
			for (i=0; i < zdim; i++) {
				A[j+i*ydim] = (float)(starpu_drand48());
			}
		}
	
		for (j=0; j < zdim; j++) {
			for (i=0; i < xdim; i++) {
				B[j+i*zdim] = (float)(starpu_drand48());
			}
		}
	}

	for (j=0; j < ydim; j++) {
		for (i=0; i < xdim; i++) {
			C[j+i*ydim] = (float)(0);
		}
	}

	display_memory_consumption();
}
Example #21
static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_node, 
						unsigned dst_node)
{
	unsigned i;
	unsigned last = 0;
	unsigned cnt;
	int ret;

	if (handle->nchildren == 0)
	{
		/* this is a leaf */
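		/* per-node replicate states follow an MSI-like protocol:
		 * OWNER = the only valid copy, SHARED = one of several valid
		 * copies, INVALID = no valid data on this node */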
		switch(handle->per_node[src_node].state) {
		case STARPU_OWNER:
			/* the local node has the only copy */
			/* the owner is now the destination_node */
			handle->per_node[src_node].state = STARPU_INVALID;
			handle->per_node[dst_node].state = STARPU_OWNER;

//FIXME: we should use requests during memory reclaim
			/* TODO use request !! */
			handle->per_node[src_node].refcnt++;
			handle->per_node[dst_node].refcnt++;

			ret = _starpu_driver_copy_data_1_to_1(handle, src_node, handle, dst_node, 0, NULL, 1);
			STARPU_ASSERT(ret == 0);

			handle->per_node[src_node].refcnt--;
			handle->per_node[dst_node].refcnt--;

			break;
		case STARPU_SHARED:
			/* some other node may have the copy */
			handle->per_node[src_node].state = STARPU_INVALID;

			/* count the number of copies */
			cnt = 0;
			for (i = 0; i < STARPU_MAXNODES; i++)
			{
				if (handle->per_node[i].state == STARPU_SHARED) {
					cnt++; 
					last = i;
				}
			}

			if (cnt == 1)
				handle->per_node[last].state = STARPU_OWNER;

			break;
		case STARPU_INVALID:
			/* nothing to be done */
			break;
		default:
			STARPU_ABORT();
			break;
		}
	}
	else {
		/* lock all sub-subtrees children */
		unsigned child;
		for (child = 0; child < handle->nchildren; child++)
		{
			transfer_subtree_to_node(&handle->children[child],
							src_node, dst_node);
		}
	}
}
Example #22
static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
				    struct _starpu_data_replicate *src_replicate,
				    struct _starpu_data_replicate *dst_replicate,
				    struct _starpu_data_request *req)
{
	unsigned src_node = src_replicate->memory_node;
	unsigned dst_node = dst_replicate->memory_node;

	STARPU_ASSERT(src_replicate->refcnt);
	STARPU_ASSERT(dst_replicate->refcnt);

	STARPU_ASSERT(src_replicate->allocated);
	STARPU_ASSERT(dst_replicate->allocated);

	_starpu_comm_amounts_inc(src_node, dst_node, handle->ops->get_size(handle));

#ifdef STARPU_SIMGRID
	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
#else /* !SIMGRID */

	int ret = 0;

	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);

#ifdef STARPU_USE_CUDA
	cudaError_t cures;
	cudaStream_t stream;
#endif

	void *src_interface = src_replicate->data_interface;
	void *dst_interface = dst_replicate->data_interface;

#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
	{
		unsigned devid;
		if ((src_kind == STARPU_CUDA_RAM) && (dst_kind == STARPU_CUDA_RAM))
		{
			/* GPU-GPU transfer, issue it from the device we are supposed to drive */
			int worker = starpu_worker_get_id();
			devid = starpu_worker_get_devid(worker);
		}
		else
		{
			unsigned node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			devid = _starpu_memory_node_get_devid(node);
		}
		starpu_cuda_set_device(devid);
	}
#endif

	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
	{
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
		if (copy_methods->ram_to_ram)
			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
#ifdef STARPU_USE_CUDA
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
		/* only the proper CUBLAS thread can initiate this directly! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			if (copy_methods->cuda_to_ram)
				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_out_transfer_stream();
			if (copy_methods->cuda_to_ram_async)
				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
		/* STARPU_CPU_RAM -> STARPU_CUDA_RAM */
		/* only the proper CUBLAS thread can initiate this! */
#if !defined(HAVE_CUDA_MEMCPY_PEER)
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
#endif
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			if (copy_methods->ram_to_cuda)
				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_local_in_transfer_stream();
			if (copy_methods->ram_to_cuda_async)
				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
		/* CUDA - CUDA transfer */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->cuda_to_cuda)
				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_CUDA_RAM;
			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);

			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
			if (copy_methods->cuda_to_cuda_async)
				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}

			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
		}
		break;
#endif
#ifdef STARPU_USE_OPENCL
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
		/* OpenCL -> RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_ram)
				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_ram_async)
				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->ram_to_opencl)
				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->ram_to_opencl_async)
				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
		{
			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			/* this is not associated to a request so it's synchronous */
			if (copy_methods->opencl_to_opencl)
				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_OPENCL_RAM;
			if (copy_methods->opencl_to_opencl_async)
				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
		}
		break;
#endif
#ifdef STARPU_USE_MIC
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
		/* RAM -> MIC */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
			if (copy_methods->ram_to_mic)
				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->ram_to_mic_async)
				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
		}
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
		/* MIC -> RAM */
		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
		{
			/* this is not associated to a request so it's synchronous */
			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
			if (copy_methods->mic_to_ram)
				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
			else
				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		}
		else
		{
			req->async_channel.type = STARPU_MIC_RAM;
			if (copy_methods->mic_to_ram_async)
				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
			else
			{
				STARPU_ASSERT(copy_methods->any_to_any);
				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			}
			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
		}
		break;
#endif
#ifdef STARPU_USE_SCC
		/* SCC RAM associated to the master process is considered as
		 * the main memory node. */
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
		/* master private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_src_to_sink)
			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
		/* slave private SCC RAM -> master private SCC RAM */
		if (copy_methods->scc_sink_to_src)
			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
		/* slave private SCC RAM -> slave private SCC RAM */
		if (copy_methods->scc_sink_to_sink)
			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
		break;
#endif

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
		if(copy_methods->any_to_any)
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);

		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
			void * ptr = NULL;
			starpu_ssize_t size = 0;
			handle->ops->pack_data(handle, src_node, &ptr, &size);
			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, &req->async_channel);
			if (ret == 0)
				/* write is already finished, ptr was allocated in pack_data */
				free(ptr);

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;
		
	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
		if(copy_methods->any_to_any) 
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
		else
		{
			void *obj = starpu_data_handle_to_pointer(handle, src_node);
			void * ptr = NULL;
			size_t size = 0;
			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, &req->async_channel);
			if (ret == 0)
			{
				/* read is already finished, we can already unpack */
				handle->ops->unpack_data(handle, dst_node, ptr, size); 
				/* ptr is allocated in full_read */
				free(ptr);
			}

			/* For now, asynchronous is not supported */
			STARPU_ASSERT(ret == 0);
		}
		break;

	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):	
		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
		break;
		
	default:
		STARPU_ABORT();
		break;
	}
	
	return ret;
#endif /* !SIMGRID */
}
Example #23
struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t handle,
						   enum starpu_node_kind node_kind)
{
	struct starpu_task *conversion_task;

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	struct starpu_multiformat_interface *format_interface;
#endif

	conversion_task = starpu_task_create();
	conversion_task->name = "conversion_task";
	conversion_task->synchronous = 0;
	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);

#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
	/* The node does not really matter here */
	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
#endif

	_starpu_spin_lock(&handle->header_lock);
	handle->refcnt++;
	handle->busy_count++;
	_starpu_spin_unlock(&handle->header_lock);

	switch(node_kind)
	{
	case STARPU_CPU_RAM:
	case STARPU_SCC_RAM:
	case STARPU_SCC_SHM:
		switch (starpu_node_get_kind(handle->mf_node))
		{
		case STARPU_CPU_RAM:
		case STARPU_SCC_RAM:
		case STARPU_SCC_SHM:
			STARPU_ABORT();
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
			break;
		}
#endif
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->mic_to_cpu_cl;
			break;
		}
#endif
		default:
			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
		}
		break;
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	case STARPU_CUDA_RAM:
		{
			struct starpu_multiformat_data_interface_ops *mf_ops;
			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
			conversion_task->cl = mf_ops->cpu_to_cuda_cl;
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	case STARPU_OPENCL_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
		break;
	}
#endif
#ifdef STARPU_USE_MIC
	case STARPU_MIC_RAM:
	{
		struct starpu_multiformat_data_interface_ops *mf_ops;
		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
		conversion_task->cl = mf_ops->cpu_to_mic_cl;
		break;
	}
#endif
	default:
		STARPU_ABORT();
	}

	STARPU_TASK_SET_MODE(conversion_task, STARPU_RW, 0);
	return conversion_task;
}