/*
 *	compute d = r
 *		descr[0] = d, descr[1] = r
 */
void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	/* simply copy r into d */
	uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));

	float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);

	memcpy(dst, src, nx*elemsize);
}
Example #2
0
void advance_eye_paths(void* buffers[], void* args_orig) {
  const timeval start_time = my_WallClockTime();

  // cl_args
  const starpu_args args;
  unsigned iteration;
  starpu_codelet_unpack_args(args_orig, &args, &iteration);

  // buffers
  // hit point static info
  HitPointPosition* const hit_points = reinterpret_cast<HitPointPosition* const>(STARPU_VECTOR_GET_PTR(buffers[0]));
  // eye paths
  EyePath* const eye_paths = reinterpret_cast<EyePath* const>(STARPU_VECTOR_GET_PTR(buffers[1]));
  const unsigned eye_paths_count = STARPU_VECTOR_GET_NX(buffers[1]);
  // seed buffer
  Seed* const seed_buffer = reinterpret_cast<Seed* const>(STARPU_VECTOR_GET_PTR(buffers[2]));



  advance_eye_paths_impl(hit_points, // hit_points_count,
                         eye_paths,         eye_paths_count,
                         seed_buffer, //    seed_buffer_count,
                         args.cpu_scene,
                         args.config->max_eye_path_depth,
                         starpu_combined_worker_get_size());

  const timeval end_time = my_WallClockTime();
  task_info("CPU", 0, starpu_combined_worker_get_size(), iteration, start_time, end_time, "(3) advance_eye_paths");
}
/* This kernel takes a buffer and scales it by a constant factor */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
	unsigned i;
	float *factor = cl_arg;

	/*
	 * The "buffers" array matches the task->handles array: for instance
	 * task->handles[0] is a handle that corresponds to a data with
	 * vector "interface", so that the first entry of the array in the
	 * codelet  is a pointer to a structure describing such a vector (ie.
	 * struct starpu_vector_interface *). Here, we therefore manipulate
	 * the buffers[0] element as a vector: nx gives the number of elements
	 * in the array, ptr gives the location of the array (that was possibly
	 * migrated/replicated), and elemsize gives the size of each elements.
	 */
	struct starpu_vector_interface *vector = buffers[0];

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(vector);

	/* get a pointer to the local copy of the vector : note that we have to
	 * cast it in (float *) since a vector could contain any type of
	 * elements so that the .ptr field is actually a uintptr_t */
	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

	/* scale the vector */
	for (i = 0; i < n; i++)
		val[i] *= *factor;
}
void task_region_h(void *buffers[], void *_args)
{
	void **args = _args;
	struct starpu_vector_interface *_vector = buffers[0];
	int nx = STARPU_VECTOR_GET_NX(_vector);
	int elemsize = STARPU_VECTOR_GET_ELEMSIZE(_vector);
	int slice_base = STARPU_VECTOR_GET_SLICE_BASE(_vector);
	int *v = (int *)STARPU_VECTOR_GET_PTR(_vector);
	int f = (int)(intptr_t)args[0];
	int imin = (int)(intptr_t)args[1];
	int imax = (int)(intptr_t)args[2];
	int i;

	assert(elemsize == sizeof(v[0]));

	printf("depth 2 task, entry: vector ptr = %p, slice_base = %d, imin = %d, imax = %d\n", v, slice_base, imin, imax);

	for (i = imin; i < imax; i++)
	{
                assert(i-slice_base>=0);
                assert(i-slice_base<NX);
                (v-slice_base)[i] += f;
	}

	printf("depth 2 task ending\n");
}
void cublas_codelet_func_5(void *descr[], void *arg)
{
	float dot;
	struct cg_problem *pb = arg;
	float *vecd, *vecq;
	uint32_t size;
	
	/* get the vector */
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vecd, 1, vecq, 1);

	pb->alpha = pb->delta_new / dot;
}
/* functions/codelet to fill the bufferss*/
void fill_tmp_buffer(void *buffers[], void *cl_arg)
{
    int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
    int nx = STARPU_VECTOR_GET_NX(buffers[0]);
    int i;

    for (i=0; i<nx; i++)
        tmp[i]=nx+i;
}
static void memset_cuda(void *descr[], void *arg)
{
	STARPU_SKIP_IF_VALGRIND;

	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
}
void memset_cpu(void *descr[], void *arg)
{
	STARPU_SKIP_IF_VALGRIND;

	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	memset(ptr, 42, n * sizeof(*ptr));
}
void read_ghost(void *buffers[], void *cl_arg)
{
    int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
    int nx=STARPU_VECTOR_GET_NX(buffers[0]);
    int i;
    for(i=0; i<nx; i++)
    {
        assert(tmp[i]==nx+i);
    }
}
Example #10
0
void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE alpha = *((TYPE *)arg);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
}
//! [To be included. You should update doxygen if you see this text.]
void scal_cpu_func(void *buffers[], void *_args)
{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
    for (i = 0; i < n; i++)
        val[i] *= *factor;
}
Example #12
0
void task_region_g(void *buffers[], void *args)
{
    struct starpu_vector_interface *_vector_1 = buffers[0];
    int nx1 = STARPU_VECTOR_GET_NX(_vector_1);
    int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1);

    struct starpu_vector_interface *_vector_2 = buffers[1];
    int nx2 = STARPU_VECTOR_GET_NX(_vector_2);
    int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2);

    int f = (int)(intptr_t)args;

    STARPU_ASSERT(nx1 == nx2);

    printf("depth 1 task, entry: vector_1 ptr = %p\n", v1);
    printf("depth 1 task, entry: vector_2 ptr = %p\n", v2);
    printf("depth 1 task, entry: f = %d\n", f);

    fprintf(stderr, "cudaMemcpy: -->\n");
    cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice);
    fprintf(stderr, "cudaMemcpy: <--\n");
}
void cpu_codelet_func_6(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecx, *vecd;
	uint32_t size;
	
	/* get the vector */
	vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
}
void cublas_codelet_func_7(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecr, *vecq;
	uint32_t size;
	
	/* get the vector */
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
}
void cublas_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float dot;
	float *vec;
	uint32_t size;
	
	/* get the vector */
	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vec, 1, vec, 1);

	pb->delta_new = dot;
	pb->delta_0 = dot;
}
/*
 *	Dot product codelet
 */
void dot_cpu_func(void *descr[], void *cl_arg)
{
	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);

	//FPRINTF_MPI(stderr, "Before dot=%ld (adding %d elements...)\n", *dot, n);
	unsigned i;
	for (i = 0; i < n; i++)
	{
		//FPRINTF_MPI(stderr, "Adding %ld\n", local_x[i]);
		*dot += local_x[i];
	}
	//FPRINTF_MPI(stderr, "After dot=%ld\n", *dot);
}
Example #17
0
void axpy_opencl(void *buffers[], void *_args)
{
	TYPE *alpha = _args;
	int id, devid;
        cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);
	err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha);
	if (err)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=n;
		size_t local;
                size_t s;
                cl_device_id device;

                starpu_opencl_get_device(devid, &device);

                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
                if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}
	starpu_opencl_release_kernel(kernel);
}
void cublas_codelet_func_8(void *descr[], void *arg)
{
	float dot;
	struct cg_problem *pb = arg;
	float *vecr;
	uint32_t size;
	
	/* get the vector */
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = STARPU_VECTOR_GET_NX(descr[0]);

	dot = cublasSdot (size, vecr, 1, vecr, 1);

	pb->delta_old = pb->delta_new;
	pb->delta_new = dot;
	pb->beta = pb->delta_new/pb->delta_old;
}
void cublas_codelet_func_9(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float *vecd, *vecr;
	uint32_t size;
	
	/* get the vector */
	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	size = STARPU_VECTOR_GET_NX(descr[0]);

	/* d = beta d */
	cublasSscal(size, pb->beta, vecd, 1);

	/* d = r + d */
	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
}
void cpu_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned *tmp = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	memcpy(tmp, v, nx*elemsize);

	unsigned i;
	for (i = 0; i < nx; i++)
	{
		v[i] = tmp[i] + 1;
	}
}
void dot_cpu_func(void *descr[], void *cl_arg)
{
	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	DOT_TYPE local_dot = 0.0;

	unsigned i;
	for (i = 0; i < n; i++)
	{
		local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i];
	}

	*dot = *dot + local_dot;
}
void cpu_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *pb = arg;
	float dot;
	float *vec;
	int size;
	
	/* get the vector */
	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	size = (int)STARPU_VECTOR_GET_NX(descr[0]);

	dot = STARPU_SDOT(size, vec, 1, vec, 1);

	fprintf(stderr, "func 3 : DOT = %f\n", dot);

	pb->delta_new = dot;
	pb->delta_0 = dot;
}
void dot_cuda_func(void *descr[], void *cl_arg)
{
	DOT_TYPE current_dot;
	DOT_TYPE local_dot;

	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
	cudaStreamSynchronize(starpu_cuda_get_local_stream());

	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);

	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
	current_dot += local_dot;

	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
	cudaStreamSynchronize(starpu_cuda_get_local_stream());
}
void vector_scal_opencl(void *buffers[], void *_args)
{
	float *factor = _args;
	int id, devid, err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	/* OpenCL copy of the vector pointer */
	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
					"vector_mult_opencl", devid);   /* Name of the codelet defined above */
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local=1;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
void minmax_cpu_func(void *descr[], void *cl_arg)
{
	/* The array containing the values */
	TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	TYPE local_min = minmax[0];
	TYPE local_max = minmax[1];

	/* Compute the min and the max elements in the array */
	unsigned i;
	for (i = 0; i < n; i++)
	{
		TYPE val = local_array[i];
		local_min = STARPU_MIN(local_min, val);
		local_max = STARPU_MAX(local_max, val);
	}

	minmax[0] = local_min;
	minmax[1] = local_max;
}
Example #26
0
File: axpy.c Project: alucas/StarPU
#define CUBLASAXPY	cublasSaxpy

#define N	(16*1024*1024)

#define NBLOCKS	8

TYPE *vec_x, *vec_y;

/* descriptors for StarPU */
starpu_data_handle handle_y, handle_x;

void axpy_cpu(void *descr[], __attribute__((unused)) void *arg)
{
	TYPE alpha = *((TYPE *)arg);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	AXPY((int)n, alpha, block_x, 1, block_y, 1);
}

#ifdef STARPU_USE_CUDA
void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
{
	TYPE alpha = *((TYPE *)arg);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
void task_region_g(void *buffers[], void *args)
{
	struct starpu_vector_interface *_vector = buffers[0];

	int nx = STARPU_VECTOR_GET_NX(_vector);
	int *v = (int *)STARPU_VECTOR_GET_PTR(_vector);
	int f = (int)(intptr_t)args;
	
	printf("depth 1 task, entry: vector ptr = %p\n", v);

	{
		int i;

		for (i = 0; i < nx; i++)
		{
			v[i] += f;
		}
	}

	{
		const int half_nx = nx/2;

		starpu_data_handle_t vector_slice_1_handle;
		starpu_vector_data_register(&vector_slice_1_handle, STARPU_MAIN_RAM, (uintptr_t)&v[0], half_nx, sizeof(v[0]));
		printf("depth 1 task, block 1: vector_slice_1_handle = %p\n", vector_slice_1_handle);

		starpu_data_handle_t vector_slice_2_handle;
		starpu_vector_data_register(&vector_slice_2_handle, STARPU_MAIN_RAM, (uintptr_t)&v[half_nx], nx-half_nx, sizeof(v[0]));
		/* set slice base */
		starpu_omp_vector_annotate(vector_slice_2_handle, half_nx);
		printf("depth 1 task, block 1: vector_slice_2_handle = %p\n", vector_slice_2_handle);

	}

	void *cl_arg_1[3];
	void *cl_arg_2[3];

	{
		struct starpu_omp_task_region_attr attr;
		const int half_nx = nx/2;
		int i;

		starpu_data_handle_t vector_slice_1_handle = starpu_data_lookup(&v[0]);
		printf("depth 1 task, block 2: vector_slice_1_handle = %p\n", vector_slice_1_handle);

		starpu_data_handle_t vector_slice_2_handle = starpu_data_lookup(&v[half_nx]);
		printf("depth 1 task, block 2: vector_slice_2_handle = %p\n", vector_slice_2_handle);

		memset(&attr, 0, sizeof(attr));
		attr.cl.cpu_funcs[0]  = task_region_h;
		attr.cl.where         = STARPU_CPU;
		attr.cl.nbuffers      = 1;
		attr.cl.modes[0]      = STARPU_RW;
		attr.cl_arg_size      = 3*sizeof(void *);
		attr.cl_arg_free      = 0;
		attr.if_clause        = 1;
		attr.final_clause     = 0;
		attr.untied_clause    = 1;
		attr.mergeable_clause = 0;

		i = 0;

		cl_arg_1[0] = (void *)(intptr_t)i++;
		cl_arg_1[1] = (void *)(intptr_t)0;
		cl_arg_1[2] = (void *)(intptr_t)half_nx;
		attr.cl_arg           = cl_arg_1;
		attr.handles          = &vector_slice_1_handle;
		starpu_omp_task_region(&attr);

		cl_arg_2[0] = (void *)(intptr_t)i++;
		cl_arg_2[1] = (void *)(intptr_t)half_nx;
		cl_arg_2[2] = (void *)(intptr_t)nx;
		attr.cl_arg           = cl_arg_2;
		attr.handles          = &vector_slice_2_handle;
		starpu_omp_task_region(&attr);
	}

	starpu_omp_taskwait();
}
Example #28
0
#include <stdlib.h>

#define NLOOPS		1000
#define VECTORSIZE	1024

static starpu_data_handle v_handle;

/*
 *	Memset
 */

#ifdef STARPU_USE_CUDA
static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
{
	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemset(buf, 42, length);
	cudaThreadSynchronize();
}
#endif

static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
{
	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);

	memset(buf, 42, length);
}

static starpu_codelet memset_cl = {