Ejemplo n.º 1
0
void advance_eye_paths(void* buffers[], void* args_orig) {
  const timeval start_time = my_WallClockTime();

  // cl_args
  const starpu_args args;
  unsigned iteration;
  starpu_codelet_unpack_args(args_orig, &args, &iteration);

  // buffers
  // hit point static info
  HitPointPosition* const hit_points = reinterpret_cast<HitPointPosition* const>(STARPU_VECTOR_GET_PTR(buffers[0]));
  // eye paths
  EyePath* const eye_paths = reinterpret_cast<EyePath* const>(STARPU_VECTOR_GET_PTR(buffers[1]));
  const unsigned eye_paths_count = STARPU_VECTOR_GET_NX(buffers[1]);
  // seed buffer
  Seed* const seed_buffer = reinterpret_cast<Seed* const>(STARPU_VECTOR_GET_PTR(buffers[2]));



  advance_eye_paths_impl(hit_points, // hit_points_count,
                         eye_paths,         eye_paths_count,
                         seed_buffer, //    seed_buffer_count,
                         args.cpu_scene,
                         args.config->max_eye_path_depth,
                         starpu_combined_worker_get_size());

  const timeval end_time = my_WallClockTime();
  task_info("CPU", 0, starpu_combined_worker_get_size(), iteration, start_time, end_time, "(3) advance_eye_paths");
}
static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	/* printf("22\n"); */
	float *block 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *in 	= (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *out 	= (float *)STARPU_VECTOR_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);

	switch (s)
	{
		case 0:
			cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Ejemplo n.º 3
0
void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE alpha = *((TYPE *)arg);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
}
void cublas_codelet_func_7(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecr, *vecq;
	uint32_t size;
	
	/* get the vector */
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
}
/*
 *	compute d = r
 *		descr[0] = d, descr[1] = r
 */
void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	/* simply copy r into d */
	uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));

	float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);

	memcpy(dst, src, nx*elemsize);
}
void cpu_codelet_func_6(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecx, *vecd;
	uint32_t size;
	
	/* get the vector */
	vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
}
void task_region_h(void *buffers[], void *_args)
{
	void **args = _args;
	struct starpu_vector_interface *_vector = buffers[0];
	int nx = STARPU_VECTOR_GET_NX(_vector);
	int elemsize = STARPU_VECTOR_GET_ELEMSIZE(_vector);
	int slice_base = STARPU_VECTOR_GET_SLICE_BASE(_vector);
	int *v = (int *)STARPU_VECTOR_GET_PTR(_vector);
	int f = (int)(intptr_t)args[0];
	int imin = (int)(intptr_t)args[1];
	int imax = (int)(intptr_t)args[2];
	int i;

	assert(elemsize == sizeof(v[0]));

	printf("depth 2 task, entry: vector ptr = %p, slice_base = %d, imin = %d, imax = %d\n", v, slice_base, imin, imax);

	for (i = imin; i < imax; i++)
	{
                assert(i-slice_base>=0);
                assert(i-slice_base<NX);
                (v-slice_base)[i] += f;
	}

	printf("depth 2 task ending\n");
}
/* This kernel takes a buffer and scales it by a constant factor */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
	unsigned i;
	float *factor = cl_arg;

	/*
	 * The "buffers" array matches the task->handles array: for instance
	 * task->handles[0] is a handle that corresponds to a data with
	 * vector "interface", so that the first entry of the array in the
	 * codelet  is a pointer to a structure describing such a vector (ie.
	 * struct starpu_vector_interface *). Here, we therefore manipulate
	 * the buffers[0] element as a vector: nx gives the number of elements
	 * in the array, ptr gives the location of the array (that was possibly
	 * migrated/replicated), and elemsize gives the size of each elements.
	 */
	struct starpu_vector_interface *vector = buffers[0];

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(vector);

	/* get a pointer to the local copy of the vector : note that we have to
	 * cast it in (float *) since a vector could contain any type of
	 * elements so that the .ptr field is actually a uintptr_t */
	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

	/* scale the vector */
	for (i = 0; i < n; i++)
		val[i] *= *factor;
}
void cublas_codelet_func_5(void *descr[], void *arg)
{
	float dot;
	struct cg_problem *pb = arg;
	float *vecd, *vecq;
	uint32_t size;
	
	/* get the vector */
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vecd, 1, vecq, 1);

	pb->alpha = pb->delta_new / dot;
}
Ejemplo n.º 10
0
void dot_cpu_func(void *descr[], void *cl_arg)
{
	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	DOT_TYPE local_dot = 0.0;

	unsigned i;
	for (i = 0; i < n; i++)
	{
		local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i];
	}

	*dot = *dot + local_dot;
}
void memset_cpu(void *descr[], void *arg)
{
	STARPU_SKIP_IF_VALGRIND;

	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	memset(ptr, 42, n * sizeof(*ptr));
}
static void memset_cuda(void *descr[], void *arg)
{
	STARPU_SKIP_IF_VALGRIND;

	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
}
Ejemplo n.º 13
0
void cpu_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned *tmp = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	memcpy(tmp, v, nx*elemsize);

	unsigned i;
	for (i = 0; i < nx; i++)
	{
		v[i] = tmp[i] + 1;
	}
}
Ejemplo n.º 14
0
/* functions/codelet to fill the bufferss*/
void fill_tmp_buffer(void *buffers[], void *cl_arg)
{
    int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
    int nx = STARPU_VECTOR_GET_NX(buffers[0]);
    int i;

    for (i=0; i<nx; i++)
        tmp[i]=nx+i;
}
void cublas_codelet_func_9(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecd, *vecr;
	uint32_t size;
	
	/* get the vector */
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	/* d = beta d */
	cublasSscal(size, pb->beta, vecd, 1);

	/* d = r + d */
	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
}
Ejemplo n.º 16
0
void read_ghost(void *buffers[], void *cl_arg)
{
    int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
    int nx=STARPU_VECTOR_GET_NX(buffers[0]);
    int i;
    for(i=0; i<nx; i++)
    {
        assert(tmp[i]==nx+i);
    }
}
Ejemplo n.º 17
0
//! [To be included. You should update doxygen if you see this text.]
void scal_cpu_func(void *buffers[], void *_args)
{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
    for (i = 0; i < n; i++)
        val[i] *= *factor;
}
Ejemplo n.º 18
0
void task_region_g(void *buffers[], void *args)
{
    struct starpu_vector_interface *_vector_1 = buffers[0];
    int nx1 = STARPU_VECTOR_GET_NX(_vector_1);
    int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1);

    struct starpu_vector_interface *_vector_2 = buffers[1];
    int nx2 = STARPU_VECTOR_GET_NX(_vector_2);
    int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2);

    int f = (int)(intptr_t)args;

    STARPU_ASSERT(nx1 == nx2);

    printf("depth 1 task, entry: vector_1 ptr = %p\n", v1);
    printf("depth 1 task, entry: vector_2 ptr = %p\n", v2);
    printf("depth 1 task, entry: f = %d\n", f);

    fprintf(stderr, "cudaMemcpy: -->\n");
    cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice);
    fprintf(stderr, "cudaMemcpy: <--\n");
}
Ejemplo n.º 19
0
void dot_cuda_func(void *descr[], void *cl_arg)
{
	DOT_TYPE current_dot;
	DOT_TYPE local_dot;

	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
	cudaStreamSynchronize(starpu_cuda_get_local_stream());

	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);

	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
	current_dot += local_dot;

	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
	cudaStreamSynchronize(starpu_cuda_get_local_stream());
}
void cpu_codelet_func_1(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
	uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
	uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);

	uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);

	float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);


	uint32_t nrow;

	nrow = STARPU_CSR_GET_NROW(descr[0]);

	unsigned row;
	for (row = 0; row < nrow; row++)
	{
		float tmp = 0.0f;
		unsigned index;

		unsigned firstindex = rowptr[row] - firstentry;
		unsigned lastindex = rowptr[row+1] - firstentry;

		for (index = firstindex; index < lastindex; index++)
		{
			unsigned col;

			col = colind[index];
			tmp += nzval[index]*vecx[col];
		}

		vecr[row] = vecb[row] - tmp;
	}
}
void cublas_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float dot;
	float *vec;
	uint32_t size;
	
	/* get the vector */
	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vec, 1, vec, 1);

	pb->delta_new = dot;
	pb->delta_0 = dot;
}
/*
 *	Dot product codelet
 */
void dot_cpu_func(void *descr[], void *cl_arg)
{
	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);

	//FPRINTF_MPI(stderr, "Before dot=%ld (adding %d elements...)\n", *dot, n);
	unsigned i;
	for (i = 0; i < n; i++)
	{
		//FPRINTF_MPI(stderr, "Adding %ld\n", local_x[i]);
		*dot += local_x[i];
	}
	//FPRINTF_MPI(stderr, "After dot=%ld\n", *dot);
}
void cublas_codelet_func_8(void *descr[], void *arg)
{
	float dot;
	struct cg_problem *pb = arg;
	float *vecr;
	uint32_t size;
	
	/* get the vector */
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vecr, 1, vecr, 1);

	pb->delta_old = pb->delta_new;
	pb->delta_new = dot;
	pb->beta = pb->delta_new/pb->delta_old;
}
void cpu_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float dot;
	float *vec;
	int size;
	
	/* get the vector */
	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = (int)STARPU_VECTOR_GET_NX(descr[0]);

	dot = STARPU_SDOT(size, vec, 1, vec, 1);

	fprintf(stderr, "func 3 : DOT = %f\n", dot);

	pb->delta_new = dot;
	pb->delta_0 = dot;
}
Ejemplo n.º 25
0
void minmax_cpu_func(void *descr[], void *cl_arg)
{
	/* The array containing the values */
	TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	TYPE local_min = minmax[0];
	TYPE local_max = minmax[1];

	/* Compute the min and the max elements in the array */
	unsigned i;
	for (i = 0; i < n; i++)
	{
		TYPE val = local_array[i];
		local_min = STARPU_MIN(local_min, val);
		local_max = STARPU_MAX(local_max, val);
	}

	minmax[0] = local_min;
	minmax[1] = local_max;
}
void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	(*tokenptr)++;
}
Ejemplo n.º 27
0
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu.h>
#include <starpu_opencl.h>
#include <CL/cl.h>

extern struct starpu_opencl_program opencl_code;

void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args)
{
        unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	cl_kernel kernel;
	cl_command_queue queue;
	int id, devid, err;

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incA", devid);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = 0;
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
/*
 *	Display codelet
 */
void display_cpu_func(void *descr[], void *cl_arg)
{
	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);

	FPRINTF_MPI(stderr, "Local=%ld\n", *local_x);
}
#include <starpu.h>

#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

#ifdef STARPU_USE_CUDA
extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
#endif

#ifdef STARPU_USE_OPENCL
extern "C" void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
struct starpu_opencl_program opencl_program;
#endif

extern "C" void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
{
	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);

	val[0] += 1.0f;
	val[1] += 1.0f;
}

int main(int argc, char **argv)
{
	int ret = 0;
	starpu_data_handle_t float_array_handle;
	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
        struct starpu_codelet cl;
	unsigned i;
	unsigned niter = 50;

	ret = starpu_init(NULL);
Ejemplo n.º 30
0
 */

#include <starpu_mpi.h>

#define NITER	2048

unsigned token = 42;
starpu_data_handle token_handle;

#ifdef STARPU_USE_CUDA
extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
#endif

void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
{
	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	(*tokenptr)++;
}

static starpu_codelet increment_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
#ifdef STARPU_USE_CUDA
	.cuda_func = increment_cuda,
#endif
	.cpu_func = increment_cpu,
	.nbuffers = 1
};

void increment_token(void)
{
	struct starpu_task *task = starpu_task_create();