void advance_eye_paths(void* buffers[], void* args_orig) { const timeval start_time = my_WallClockTime(); // cl_args const starpu_args args; unsigned iteration; starpu_codelet_unpack_args(args_orig, &args, &iteration); // buffers // hit point static info HitPointPosition* const hit_points = reinterpret_cast<HitPointPosition* const>(STARPU_VECTOR_GET_PTR(buffers[0])); // eye paths EyePath* const eye_paths = reinterpret_cast<EyePath* const>(STARPU_VECTOR_GET_PTR(buffers[1])); const unsigned eye_paths_count = STARPU_VECTOR_GET_NX(buffers[1]); // seed buffer Seed* const seed_buffer = reinterpret_cast<Seed* const>(STARPU_VECTOR_GET_PTR(buffers[2])); advance_eye_paths_impl(hit_points, // hit_points_count, eye_paths, eye_paths_count, seed_buffer, // seed_buffer_count, args.cpu_scene, args.config->max_eye_path_depth, starpu_combined_worker_get_size()); const timeval end_time = my_WallClockTime(); task_info("CPU", 0, starpu_combined_worker_get_size(), iteration, start_time, end_time, "(3) advance_eye_paths"); }
static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { /* printf("22\n"); */ float *block = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *in = (float *)STARPU_VECTOR_GET_PTR(descr[1]); float *out = (float *)STARPU_VECTOR_GET_PTR(descr[2]); unsigned dx = STARPU_MATRIX_GET_NX(descr[0]); unsigned dy = STARPU_MATRIX_GET_NY(descr[0]); unsigned ld = STARPU_MATRIX_GET_LD(descr[0]); switch (s) { case 0: cblas_sgemv(CblasRowMajor, CblasNoTrans, dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1); break; #ifdef STARPU_USE_CUDA case 1: cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1); break; #endif default: STARPU_ABORT(); break; } }
void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { TYPE alpha = *((TYPE *)arg); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]); CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1); }
void cublas_codelet_func_7(void *descr[], void *arg) { struct cg_problem *pb = arg; float *vecr, *vecq; uint32_t size; /* get the vector */ vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]); vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]); size = STARPU_VECTOR_GET_NX(descr[0]); cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1); }
/* * compute d = r * descr[0] = d, descr[1] = r */ void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { /* simply copy r into d */ uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]); size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]); STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1])); STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1])); float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]); float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]); memcpy(dst, src, nx*elemsize); }
void cpu_codelet_func_6(void *descr[], void *arg) { struct cg_problem *pb = arg; float *vecx, *vecd; uint32_t size; /* get the vector */ vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]); vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]); size = STARPU_VECTOR_GET_NX(descr[0]); STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1); }
void task_region_h(void *buffers[], void *_args) { void **args = _args; struct starpu_vector_interface *_vector = buffers[0]; int nx = STARPU_VECTOR_GET_NX(_vector); int elemsize = STARPU_VECTOR_GET_ELEMSIZE(_vector); int slice_base = STARPU_VECTOR_GET_SLICE_BASE(_vector); int *v = (int *)STARPU_VECTOR_GET_PTR(_vector); int f = (int)(intptr_t)args[0]; int imin = (int)(intptr_t)args[1]; int imax = (int)(intptr_t)args[2]; int i; assert(elemsize == sizeof(v[0])); printf("depth 2 task, entry: vector ptr = %p, slice_base = %d, imin = %d, imax = %d\n", v, slice_base, imin, imax); for (i = imin; i < imax; i++) { assert(i-slice_base>=0); assert(i-slice_base<NX); (v-slice_base)[i] += f; } printf("depth 2 task ending\n"); }
/* This kernel takes a buffer and scales it by a constant factor */ void vector_scal_cpu(void *buffers[], void *cl_arg) { unsigned i; float *factor = cl_arg; /* * The "buffers" array matches the task->handles array: for instance * task->handles[0] is a handle that corresponds to a data with * vector "interface", so that the first entry of the array in the * codelet is a pointer to a structure describing such a vector (ie. * struct starpu_vector_interface *). Here, we therefore manipulate * the buffers[0] element as a vector: nx gives the number of elements * in the array, ptr gives the location of the array (that was possibly * migrated/replicated), and elemsize gives the size of each elements. */ struct starpu_vector_interface *vector = buffers[0]; /* length of the vector */ unsigned n = STARPU_VECTOR_GET_NX(vector); /* get a pointer to the local copy of the vector : note that we have to * cast it in (float *) since a vector could contain any type of * elements so that the .ptr field is actually a uintptr_t */ float *val = (float *)STARPU_VECTOR_GET_PTR(vector); /* scale the vector */ for (i = 0; i < n; i++) val[i] *= *factor; }
void cublas_codelet_func_5(void *descr[], void *arg) { float dot; struct cg_problem *pb = arg; float *vecd, *vecq; uint32_t size; /* get the vector */ vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]); vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]); STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1])); size = STARPU_VECTOR_GET_NX(descr[0]); dot = cublasSdot (size, vecd, 1, vecq, 1); pb->alpha = pb->delta_new / dot; }
void dot_cpu_func(void *descr[], void *cl_arg) { float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); DOT_TYPE local_dot = 0.0; unsigned i; for (i = 0; i < n; i++) { local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i]; } *dot = *dot + local_dot; }
void memset_cpu(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); memset(ptr, 42, n * sizeof(*ptr)); }
static void memset_cuda(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream()); }
void cpu_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { STARPU_SKIP_IF_VALGRIND; unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned *tmp = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]); unsigned nx = STARPU_VECTOR_GET_NX(descr[0]); size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]); memcpy(tmp, v, nx*elemsize); unsigned i; for (i = 0; i < nx; i++) { v[i] = tmp[i] + 1; } }
/* functions/codelet to fill the bufferss*/ void fill_tmp_buffer(void *buffers[], void *cl_arg) { int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]); int nx = STARPU_VECTOR_GET_NX(buffers[0]); int i; for (i=0; i<nx; i++) tmp[i]=nx+i; }
void cublas_codelet_func_9(void *descr[], void *arg) { struct cg_problem *pb = arg; float *vecd, *vecr; uint32_t size; /* get the vector */ vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]); vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]); size = STARPU_VECTOR_GET_NX(descr[0]); /* d = beta d */ cublasSscal(size, pb->beta, vecd, 1); /* d = r + d */ cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1); }
void read_ghost(void *buffers[], void *cl_arg) { int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]); int nx=STARPU_VECTOR_GET_NX(buffers[0]); int i; for(i=0; i<nx; i++) { assert(tmp[i]==nx+i); } }
//! [To be included. You should update doxygen if you see this text.] void scal_cpu_func(void *buffers[], void *_args) { unsigned i; float *factor = _args; struct starpu_vector_interface *vector = buffers[0]; unsigned n = STARPU_VECTOR_GET_NX(vector); float *val = (float *)STARPU_VECTOR_GET_PTR(vector); #pragma omp parallel for num_threads(starpu_combined_worker_get_size()) for (i = 0; i < n; i++) val[i] *= *factor; }
void task_region_g(void *buffers[], void *args) { struct starpu_vector_interface *_vector_1 = buffers[0]; int nx1 = STARPU_VECTOR_GET_NX(_vector_1); int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1); struct starpu_vector_interface *_vector_2 = buffers[1]; int nx2 = STARPU_VECTOR_GET_NX(_vector_2); int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2); int f = (int)(intptr_t)args; STARPU_ASSERT(nx1 == nx2); printf("depth 1 task, entry: vector_1 ptr = %p\n", v1); printf("depth 1 task, entry: vector_2 ptr = %p\n", v2); printf("depth 1 task, entry: f = %d\n", f); fprintf(stderr, "cudaMemcpy: -->\n"); cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice); fprintf(stderr, "cudaMemcpy: <--\n"); }
void dot_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE current_dot; DOT_TYPE local_dot; float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemcpyAsync(¤t_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1); /* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */ current_dot += local_dot; cudaMemcpyAsync(dot, ¤t_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); }
void cpu_codelet_func_1(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg) { float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]); uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]); uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]); uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]); float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]); float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]); float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]); uint32_t nrow; nrow = STARPU_CSR_GET_NROW(descr[0]); unsigned row; for (row = 0; row < nrow; row++) { float tmp = 0.0f; unsigned index; unsigned firstindex = rowptr[row] - firstentry; unsigned lastindex = rowptr[row+1] - firstentry; for (index = firstindex; index < lastindex; index++) { unsigned col; col = colind[index]; tmp += nzval[index]*vecx[col]; } vecr[row] = vecb[row] - tmp; } }
void cublas_codelet_func_3(void *descr[], void *arg) { struct cg_problem *pb = arg; float dot; float *vec; uint32_t size; /* get the vector */ vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]); size = STARPU_VECTOR_GET_NX(descr[0]); dot = cublasSdot (size, vec, 1, vec, 1); pb->delta_new = dot; pb->delta_0 = dot; }
/* * Dot product codelet */ void dot_cpu_func(void *descr[], void *cl_arg) { long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]); //FPRINTF_MPI(stderr, "Before dot=%ld (adding %d elements...)\n", *dot, n); unsigned i; for (i = 0; i < n; i++) { //FPRINTF_MPI(stderr, "Adding %ld\n", local_x[i]); *dot += local_x[i]; } //FPRINTF_MPI(stderr, "After dot=%ld\n", *dot); }
void cublas_codelet_func_8(void *descr[], void *arg) { float dot; struct cg_problem *pb = arg; float *vecr; uint32_t size; /* get the vector */ vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]); size = STARPU_VECTOR_GET_NX(descr[0]); dot = cublasSdot (size, vecr, 1, vecr, 1); pb->delta_old = pb->delta_new; pb->delta_new = dot; pb->beta = pb->delta_new/pb->delta_old; }
void cpu_codelet_func_3(void *descr[], void *arg) { struct cg_problem *pb = arg; float dot; float *vec; int size; /* get the vector */ vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]); size = (int)STARPU_VECTOR_GET_NX(descr[0]); dot = STARPU_SDOT(size, vec, 1, vec, 1); fprintf(stderr, "func 3 : DOT = %f\n", dot); pb->delta_new = dot; pb->delta_0 = dot; }
void minmax_cpu_func(void *descr[], void *cl_arg) { /* The array containing the values */ TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); TYPE local_min = minmax[0]; TYPE local_max = minmax[1]; /* Compute the min and the max elements in the array */ unsigned i; for (i = 0; i < n; i++) { TYPE val = local_array[i]; local_min = STARPU_MIN(local_min, val); local_max = STARPU_MAX(local_max, val); } minmax[0] = local_min; minmax[1] = local_max; }
void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]); (*tokenptr)++; }
* This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include <starpu.h> #include <starpu_opencl.h> #include <CL/cl.h> extern struct starpu_opencl_program opencl_code; void opencl_codelet_incA(void *descr[], __attribute__ ((unused)) void *_args) { unsigned *val = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]); cl_kernel kernel; cl_command_queue queue; int id, devid, err; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "incA", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val); if (err) STARPU_OPENCL_REPORT_ERROR(err); {
/* * Display codelet */ void display_cpu_func(void *descr[], void *cl_arg) { long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]); FPRINTF_MPI(stderr, "Local=%ld\n", *local_x); }
#include <starpu.h> #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0) #ifdef STARPU_USE_CUDA extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args); #endif #ifdef STARPU_USE_OPENCL extern "C" void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args); struct starpu_opencl_program opencl_program; #endif extern "C" void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args) { float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]); val[0] += 1.0f; val[1] += 1.0f; } int main(int argc, char **argv) { int ret = 0; starpu_data_handle_t float_array_handle; float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f}; struct starpu_codelet cl; unsigned i; unsigned niter = 50; ret = starpu_init(NULL);
*/ #include <starpu_mpi.h> #define NITER 2048 unsigned token = 42; starpu_data_handle token_handle; #ifdef STARPU_USE_CUDA extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args); #endif void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args) { unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]); (*tokenptr)++; } static starpu_codelet increment_cl = { .where = STARPU_CPU|STARPU_CUDA, #ifdef STARPU_USE_CUDA .cuda_func = increment_cuda, #endif .cpu_func = increment_cpu, .nbuffers = 1 }; void increment_token(void) { struct starpu_task *task = starpu_task_create();