예제 #1
0
/*
 * Wait for termination
 */
void wait_end_tasks(int rank)
{
	int bz;
	int nbz = get_nbz();

	for (bz = 0; bz < nbz; bz++)
	{
		if (get_block_mpi_node(bz) == rank)
		{
			/* Wait for the task producing block "bz" */
			starpu_tag_wait(TAG_FINISH(bz));

			/* Get the result back to memory */
			struct block_description *block = get_block_description(bz);
			starpu_data_acquire(block->layers_handle[0], STARPU_R);
			starpu_data_acquire(block->layers_handle[1], STARPU_R);
			/* the data_acquire here is done to make sure
			 * the data is sent back to the ram memory, we
			 * can safely do a data_release, to avoid the
			 * data_unregister to block later on
			 */
			starpu_data_release(block->layers_handle[0]);
			starpu_data_release(block->layers_handle[1]);
		}
	}
}
예제 #2
0
/*
 * Create all the tasks
 */
void create_tasks(int rank)
{
	int iter;
	int bz;
	int niter = get_niter();
	int nbz = get_nbz();

	for (bz = 0; bz < nbz; bz++)
	{
		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
			create_start_task(bz, +1);
		if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
			create_start_task(bz, -1);
	}

	for (iter = 0; iter <= niter; iter++)
	{
	     for (bz = 0; bz < nbz; bz++)
	     {
		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
			  create_task_update(iter, bz, rank);

	     }
	     for (bz = 0; bz < nbz; bz++)
	     {
		     if (iter != niter)
		     {
			     if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
				     create_task_save(iter, bz, +1, rank);

			     if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz-1) == rank))
				     create_task_save(iter, bz, -1, rank);
		     }
	     }
	}
}
예제 #3
0
/*
 * cl_update (CUDA version)
 */
static void update_func_cuda(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d              !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
    load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

        /* And perform actual computation */
#ifdef LIFE
        cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
#endif /* LIFE */
    }

    cudaError_t cures;
    if ((cures = cudaThreadSynchronize()) != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(cures);

}
#endif /* STARPU_USE_CUDA */

/*
 * cl_update (CPU version)
 */
static void update_func_cpu(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
    else
        DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG( "!!!           RANK %d            !!!\n", rank);
#endif
    DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     *	Load neighbours' boundaries : TOP
     */

    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

    /*
     *	Load neighbours' boundaries : BOTTOM
     */
    load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
    load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

    /*
     *	Stencils ... do the actual work here :) TODO
     */

    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        unsigned ldy = oldb->ldy, ldz = oldb->ldz;
        unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
        unsigned x, y, z;
        unsigned stepx = 1;
        unsigned stepy = 1;
        unsigned stepz = 1;
        unsigned idx = 0;
        unsigned idy = 0;
        unsigned idz = 0;
        TYPE *ptr = old;

#		include "shadow.h"

        /* And perform actual computation */
#ifdef LIFE
        life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
#endif /* LIFE */
    }
}

/* Performance model and codelet structure */
static struct starpu_perfmodel_t cl_update_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "cl_update"
};

starpu_codelet cl_update = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = update_func_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = update_func_cuda,
#endif
    .model = &cl_update_model,
    .nbuffers = 6
};

/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version */
static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully garanteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    memcpy(boundary_data, &block_data[offset], boundary_size);
}

/* CUDA version */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
        starpu_block_interface_t *boundary,
        unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully garanteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;

    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
}
#endif /* STARPU_USE_CUDA */

/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K)- K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], K);
    load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}

/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG( "DO SAVE Top block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K)- K */
    unsigned block_size_z = get_block_size(block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
    cudaThreadSynchronize();
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG( "DO SAVE Bottom block %d on CUDA\n", block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], K);
    load_subblock_into_buffer_cuda(descr[1], descr[3], K);
    cudaThreadSynchronize();
}
#endif /* STARPU_USE_CUDA */

/* Performance models and codelet for save */
static struct starpu_perfmodel_t save_cl_bottom_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_bottom"
};

static struct starpu_perfmodel_t save_cl_top_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_top"
};

starpu_codelet save_cl_bottom = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_bottom_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_bottom_cuda,
#endif
    .model = &save_cl_bottom_model,
    .nbuffers = 4
};

starpu_codelet save_cl_top = {
    .where =
#ifdef STARPU_USE_CUDA
    STARPU_CUDA|
#endif
    STARPU_CPU,
    .cpu_func = dummy_func_top_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_top_cuda,
#endif
    .model = &save_cl_top_model,
    .nbuffers = 4
};