/*
 * Submit the tasks and MPI requests that exchange one temporary buffer with
 * el->foreign_domain, using el->tag as the MPI tag in both directions.
 *
 * All communications use the detached variants, and the two temporary
 * handles are released through starpu_data_unregister_submit().
 */
void insert_work_for_one_element(struct element *el)
{
    starpu_data_handle_t incoming;
    starpu_data_handle_t outgoing;

    /* Temporary vectors (el->tag elements of sizeof(int) bytes), registered
     * with home node -1 and a null address. */
    starpu_vector_data_register(&incoming, -1, 0, el->tag, sizeof(int));
    starpu_vector_data_register(&outgoing, -1, 0, el->tag, sizeof(int));

    /* --- Send side --- */

    /* Stand-in for the computation that produces the data to send. */
    starpu_insert_task(&fill_tmp_buffer_cl,
                       STARPU_W, outgoing,
                       0);

    /* Ordering task: takes the element's send-order handle in RW mode
     * together with the send buffer.
     * NOTE(review): presumably serialises send submissions -- confirm
     * against the submitted_order codelet. */
    starpu_insert_task(&submitted_order,
                       STARPU_RW, el->ensure_submitted_order_send,
                       STARPU_W, outgoing,
                       0);
    starpu_mpi_isend_detached(outgoing, el->foreign_domain, el->tag,
                              MPI_COMM_WORLD, NULL, NULL);

    /* --- Receive side --- */

    /* Same ordering task, this time on the receive-order handle. */
    starpu_insert_task(&submitted_order,
                       STARPU_RW, el->ensure_submitted_order_recv,
                       STARPU_W, incoming,
                       0);
    starpu_mpi_irecv_detached(incoming, el->foreign_domain, el->tag,
                              MPI_COMM_WORLD, NULL, NULL);

    /* Stand-in for the consumer of the received value. */
    starpu_insert_task(&read_ghost_value_cl,
                       STARPU_R, incoming,
                       0);

    /* Release both temporaries (send buffer first, as before). */
    starpu_data_unregister_submit(outgoing);
    starpu_data_unregister_submit(incoming);
}
/*
 * Post the two detached MPI receives for the boundary of block z.
 *
 * Block z must belong to another MPI node, and its neighbour in direction
 * dir must belong to local_rank; both conditions are asserted. The data is
 * received into the neighbour's boundary handles, and recv_done is invoked
 * with z (cast to a pointer) as its argument for each receive.
 */
static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, int local_rank)
{
	struct block_description *remote_block = get_block_description(z);
	STARPU_ASSERT(remote_block->mpi_node != local_rank);

	struct block_description *local_block = remote_block->boundary_blocks[(1+dir)/2];
	STARPU_ASSERT(local_block->mpi_node == local_rank);

	int sender = remote_block->mpi_node;
	void *done_arg = (void*)(uintptr_t)z;

	/* The remote border lands in the local neighbour's copy. */
	starpu_data_handle_t first_layer = local_block->boundaries_handle[(1-dir)/2][0];
	starpu_data_handle_t second_layer = local_block->boundaries_handle[(1-dir)/2][1];

	starpu_mpi_irecv_detached(first_layer, sender, MPI_TAG0(z, iter, dir), MPI_COMM_WORLD, recv_done, done_arg);
	starpu_mpi_irecv_detached(second_layer, sender, MPI_TAG1(z, iter, dir), MPI_COMM_WORLD, recv_done, done_arg);
}
/*
 * Post a detached receive of data_handle from source (MPI tag data_tag on
 * communicator comm) with starpu_mpi_unlock_tag_callback as its completion
 * callback; the callback receives a heap copy of the StarPU tag `tag`.
 *
 * Returns the value returned by starpu_mpi_irecv_detached(). Aborts if the
 * tag copy cannot be allocated.
 *
 * NOTE(review): the heap copy is presumably released by
 * starpu_mpi_unlock_tag_callback -- confirm, nothing here frees it.
 */
int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, starpu_tag_t tag)
{
	/* The callback runs after this function has returned, so the tag
	 * cannot live on this stack frame. */
	starpu_tag_t *tagptr = malloc(sizeof *tagptr);
	if (tagptr == NULL)
	{
		/* Fail in a defined way instead of dereferencing NULL. */
		abort();
	}
	*tagptr = tag;

	return starpu_mpi_irecv_detached(data_handle, source, data_tag, comm, starpu_mpi_unlock_tag_callback, tagptr);
}
/*
 * Post array_size detached receives; request i receives data_handle[i] from
 * source[i] with MPI tag data_tag[i] on communicator comm[i]. Every request
 * shares one heap-allocated struct arg_array (holding array_size and tag)
 * that is passed to starpu_mpi_array_unlock_callback.
 *
 * Always returns 0. Aborts if the shared argument cannot be allocated.
 *
 * NOTE(review): the shared argument is presumably released by
 * starpu_mpi_array_unlock_callback once every receive has completed --
 * confirm, nothing here frees it.
 * NOTE(review): the return values of the individual
 * starpu_mpi_irecv_detached() calls are not inspected.
 */
int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *data_tag, MPI_Comm *comm, starpu_tag_t tag)
{
	/* With no request to post, no callback would ever receive the shared
	 * argument, so allocating it would only leak it. */
	if (array_size == 0)
	{
		return 0;
	}

	struct arg_array *arg = malloc(sizeof *arg);
	if (arg == NULL)
	{
		/* Fail in a defined way instead of dereferencing NULL. */
		abort();
	}

	arg->array_size = array_size;
	arg->tag = tag;

	unsigned elem;
	for (elem = 0; elem < array_size; elem++)
	{
		starpu_mpi_irecv_detached(data_handle[elem], source[elem], data_tag[elem], comm[elem], starpu_mpi_array_unlock_callback, arg);
	}

	return 0;
}
/*
 * Scatter the non-NULL entries of data_handles[0..count-1] from root to the
 * nodes that own them, using detached communications on comm.
 *
 * On the root, every handle whose owner is another node is sent to that
 * owner. On any other node, every handle owned by the calling node is
 * received from root. The MPI tag of each transfer is the tag attached to
 * the handle, which must be >= 0.
 *
 * scallback/sarg are used on the root, rcallback/rarg on the other nodes.
 * When the selected callback is non-NULL, a shared argument recording the
 * number of expected transfers is handed to _callback_collective for each
 * transfer. If a callback is set but the calling node has no transfer to
 * perform, the function returns immediately and the callback is not
 * registered anywhere.
 *
 * Always returns 0.
 *
 * NOTE(review): the shared argument is presumably released by
 * _callback_collective after the last transfer -- confirm.
 */
int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
{
	int rank;
	int x;
	struct _callback_arg *callback_arg = NULL;
	void (*callback_func)(void *) = NULL;
	void (*callback)(void *);

	starpu_mpi_comm_rank(comm, &rank);

	callback = (rank == root) ? scallback : rcallback;
	if (callback)
	{
		callback_func = _callback_collective;
		callback_arg = malloc(sizeof *callback_arg);
		STARPU_ASSERT_MSG(callback_arg != NULL, "Cannot allocate the collective callback argument");
		callback_arg->count = 0;
		callback_arg->nb = 0;
		callback_arg->callback = callback;
		callback_arg->arg = (rank == root) ? sarg : rarg;

		/* First pass: count the transfers this node will take part in,
		 * with the same conditions as the submission loop below. */
		for(x = 0; x < count ; x++)
		{
			if (data_handles[x])
			{
				int owner = starpu_mpi_data_get_rank(data_handles[x]);
				int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
				STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
				if ((rank == root) && (owner != root))
				{
					callback_arg->count ++;
				}
				if ((rank != root) && (owner == rank))
				{
					callback_arg->count ++;
				}
			}
		}

		/* Nothing to transfer on this node: no request would carry the
		 * argument, so release it here. */
		if (!callback_arg->count)
		{
			free(callback_arg);
			return 0;
		}
	}

	/* Second pass: submit the transfers. */
	for(x = 0; x < count ; x++)
	{
		if (data_handles[x])
		{
			int owner = starpu_mpi_data_get_rank(data_handles[x]);
			int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
			STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
			if ((rank == root) && (owner != root))
			{
				/* Root sends the handle to its owner. */
				starpu_mpi_isend_detached(data_handles[x], owner, data_tag, comm, callback_func, callback_arg);
			}
			if ((rank != root) && (owner == rank))
			{
				/* The owner receives its handle from the root. */
				starpu_mpi_irecv_detached(data_handles[x], root, data_tag, comm, callback_func, callback_arg);
			}
		}
	}
	return 0;
}
/*
 * Exchange NB handles between the ranks of a pair (2k, 2k+1).
 *
 * Odd ranks send handles[0], then handles[NB-1], then handles[1..NB-2], each
 * with its index as MPI tag, and return 0.
 *
 * Even ranks post the receive for tag 0, sleep, then post the remaining
 * receives. With detached != 0 the receives are detached and the function
 * waits until the counter passed to callback reaches NB; otherwise each
 * request is waited for and func is called on the matching handle. The
 * value returned is the one accumulated by func (0 in detached mode, where
 * func is not called).
 */
int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detached)
{
	const int is_sender = (rank%2 != 0);
	const int peer = is_sender ? rank-1 : rank+1;
	int i;

	if (is_sender)
	{
		/* First and last tags go out first, the middle ones afterwards. */
		starpu_mpi_send(handles[0], peer, 0, MPI_COMM_WORLD);
		starpu_mpi_send(handles[NB-1], peer, NB-1, MPI_COMM_WORLD);
		for (i = 1; i < NB-1; i++)
		{
			starpu_mpi_send(handles[i], peer, i, MPI_COMM_WORLD);
		}
		return 0;
	}

	/* Receiver side. */
	int status = 0;
	starpu_mpi_req req[NB];
	int received = 0;

	if (!detached)
	{
		memset(req, 0, NB*sizeof(starpu_mpi_req));
	}

	for (i = 0; i < NB; i++)
	{
		if (detached)
		{
			starpu_mpi_irecv_detached(handles[i], peer, i, MPI_COMM_WORLD, callback, &received);
		}
		else
		{
			starpu_mpi_irecv(handles[i], &req[i], peer, i, MPI_COMM_WORLD);
			STARPU_ASSERT(req[i] != NULL);
		}

		if (i == 0)
		{
			/* Pause after the first receive so that messages with later
			 * tags (the sender emits tag NB-1 right after tag 0) arrive
			 * before their receives are posted. */
			usleep(2000000);
		}
	}

	if (!detached)
	{
		for (i = 0; i < NB; i++)
		{
			starpu_mpi_wait(&req[i], MPI_STATUS_IGNORE);
			func(handles[i], i, rank, &status);
		}
		return status;
	}

	/* Detached mode: wait until the counter handed to callback reaches NB. */
	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
	while (received != NB)
	{
		FPRINTF_MPI(stderr, "Received %d messages\n", received);
		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);

	return status;
}
/*
 * Run one send/receive scenario between rank 1 (sender) and rank 0
 * (receiver) on two int variables registered with MPI tags 77 and 88.
 *
 * rank      - MPI rank of the caller.
 * sdetached - non-zero: rank 1 uses detached sends, otherwise blocking sends.
 * rdetached - non-zero: rank 0 uses detached receives and waits for the
 *             counter passed to callback to reach 2, otherwise blocking
 *             receives.
 *
 * Rank 1 sends data[1] before data[0]; rank 0 receives data[0] before
 * data[1]. On rank 0 the result is 0 when both values equal VAL0/VAL1 and 1
 * otherwise; on other ranks the value left in ret by the initialisation
 * calls is returned.
 */
int do_test(int rank, int sdetached, int rdetached)
{
	int ret, i;
	int val[2];
	starpu_data_handle_t data[2];

	ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
	ret = starpu_mpi_init(NULL, NULL, 0);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");

	/* Only the sender starts with the reference values. */
	val[0] = (rank == 1) ? VAL0 : -1;
	val[1] = (rank == 1) ? VAL1 : -1;

	for (i = 0; i < 2; i++)
	{
		starpu_variable_data_register(&data[i], STARPU_MAIN_RAM, (uintptr_t)&val[i], sizeof(val[i]));
	}
	starpu_mpi_data_register(data[0], 77, 1);
	starpu_mpi_data_register(data[1], 88, 1);

	switch (rank)
	{
	case 1:
		/* Send in reverse index order: data[1] first, then data[0]. */
		for (i = 2; i-- > 0; )
		{
			if (sdetached)
			{
				starpu_mpi_isend_detached(data[i], 0, starpu_data_get_tag(data[i]), MPI_COMM_WORLD, NULL, NULL);
			}
			else
			{
				starpu_mpi_send(data[i], 0, starpu_data_get_tag(data[i]), MPI_COMM_WORLD);
			}
		}
		break;

	case 0:
	{
		int received = 0;

		for (i = 0; i < 2; i++)
		{
			FPRINTF_MPI(stderr, "Value[%d] = %d\n", i, val[i]);
		}

		for (i = 0; i < 2; i++)
		{
			if (rdetached)
			{
				starpu_mpi_irecv_detached(data[i], 1, starpu_data_get_tag(data[i]), MPI_COMM_WORLD, callback, &received);
			}
			else
			{
				starpu_mpi_recv(data[i], 1, starpu_data_get_tag(data[i]), MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			}
		}

		if (rdetached)
		{
			/* Wait until the counter handed to callback reaches 2. */
			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			while (received != 2)
			{
				FPRINTF_MPI(stderr, "Received %d messages\n", received);
				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
			}
			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
		}

		/* Print the received values while holding both handles in read mode. */
		for (i = 0; i < 2; i++)
		{
			starpu_data_acquire(data[i], STARPU_R);
		}
		for (i = 0; i < 2; i++)
		{
			FPRINTF_MPI(stderr, "Value[%d] = %d\n", i, val[i]);
		}
		for (i = 0; i < 2; i++)
		{
			starpu_data_release(data[i]);
		}
		break;
	}

	default:
		/* Other ranks take no part in the exchange. */
		break;
	}

	FPRINTF_MPI(stderr, "Waiting ...\n");
	starpu_task_wait_for_all();

	starpu_data_unregister(data[0]);
	starpu_data_unregister(data[1]);

	if (rank == 0)
	{
		ret = !(val[0] == VAL0 && val[1] == VAL1);
	}

	starpu_mpi_shutdown();
	starpu_shutdown();
	return ret;
}
/*
 * Token-ring test: a token circulates NITER times around all MPI ranks.
 *
 * In every loop each rank receives the token from its predecessor (except
 * rank 0 in the first loop, which sets the token to 0), calls
 * increment_token(), and forwards it to its successor (except the last rank
 * in the last loop, which prints it). The last rank finally asserts that the
 * token equals NITER * size.
 *
 * Returns STARPU_TEST_SKIPPED when fewer than 2 processes are available,
 * 0 otherwise.
 */
int main(int argc, char **argv)
{
	int ret, rank, size;

	ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
	ret = starpu_mpi_init(NULL, NULL, 1);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	if (size < 2)
	{
		if (rank == 0)
			FPRINTF(stderr, "We need at least 2 processes.\n");

		/* starpu_mpi_init() was asked to initialise MPI itself (third
		 * argument is 1), so shut StarPU-MPI and StarPU down properly
		 * instead of finalising MPI underneath them. */
		starpu_mpi_shutdown();
		starpu_shutdown();
		return STARPU_TEST_SKIPPED;
	}


	starpu_vector_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, 1, sizeof(token));

	int nloops = NITER;
	int loop;

	int last_loop = nloops - 1;
	int last_rank = size - 1;

	for (loop = 0; loop < nloops; loop++)
	{
		/* Tag of the message this rank receives in this loop; the
		 * message it sends uses tag+1. */
		int tag = loop*size + rank;

		if (loop == 0 && rank == 0)
		{
			/* The very first holder creates the token instead of receiving it. */
			token = 0;
			FPRINTF(stdout, "Start with token value %u\n", token);
		}
		else
		{
			starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
		}

		increment_token();

		if (loop == last_loop && rank == last_rank)
		{
			/* Final holder: read the token under a read acquisition. */
			starpu_data_acquire(token_handle, STARPU_R);
			FPRINTF(stdout, "Finished : token value %u\n", token);
			starpu_data_release(token_handle);
		}
		else
		{
			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
		}
	}

	starpu_task_wait_for_all();

	starpu_data_unregister(token_handle);
	starpu_mpi_shutdown();
	starpu_shutdown();

	if (rank == last_rank)
	{
		/* nloops is an int, so it is printed with %d. */
		FPRINTF(stderr, "[%d] token = %u == %d * %d ?\n", rank, token, nloops, size);
		STARPU_ASSERT(token == nloops*size);
	}

	return 0;
}
/*
 * Exchange a user-defined "complex" data handle between rank 0 and rank 1.
 *
 * Rank 0 displays and sends `handle` (MPI tag 10), receives `handle2`
 * (MPI tag 20), displays it and submits cl_compare on both handles; the
 * codelet is given the address of `compare`. Rank 1 starts with zeroed
 * values, receives `handle`, displays it and sends it back with tag 20.
 *
 * Returns 77 when fewer than 2 nodes are available. Otherwise rank 0
 * returns !compare and the other ranks return 0.
 */
int main(int argc, char **argv)
{
	int rank, nodes;
	int ret=0;
	int compare=0;

	ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
	ret = starpu_mpi_init(&argc, &argv, 1);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);

	if (nodes < 2)
	{
		fprintf(stderr, "This program needs at least 2 nodes (%d available)\n", nodes);
		ret = 77;
	}
	else
	{
		starpu_data_handle_t handle;
		starpu_data_handle_t handle2;

		double real[2] = {4.0, 2.0};
		double imaginary[2] = {7.0, 9.0};

		double real2[2] = {14.0, 12.0};
		double imaginary2[2] = {17.0, 19.0};

		if (rank == 1)
		{
			/* The receiver starts from zeros so the transfer is observable. */
			real[0] = 0.0;
			real[1] = 0.0;
			imaginary[0] = 0.0;
			imaginary[1] = 0.0;
		}

		starpu_complex_data_register(&handle, STARPU_MAIN_RAM, real, imaginary, 2);
		starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);

		if (rank == 0)
		{
			/* STARPU_VALUE copies the pointer itself, so the codelet
			 * gets the address of `compare`. */
			int *compare_ptr = &compare;

			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);

			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
			starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
		}
		else if (rank == 1)
		{
			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
			starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
		}

		starpu_task_wait_for_all();

		starpu_data_unregister(handle);
		starpu_data_unregister(handle2);
	}

	starpu_mpi_shutdown();
	starpu_shutdown();

	/* When the test was skipped no comparison ran, so `compare` is
	 * meaningless: report the skip code on every rank, rank 0 included. */
	if (nodes < 2)
		return ret;

	return (rank == 0) ? !compare : ret;
}