/*
 *	Codelet computing d = r
 *		descr[0] = d (destination vector)
 *		descr[1] = r (source vector)
 *
 *	Both vectors must have the same number of elements and the same
 *	element size; the codelet argument is not used.
 */
void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	float *d = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *r = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	/* number of bytes of d = element count times size of one element */
	uint32_t count = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elem_bytes = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));

	/* the whole operation is a plain copy of r into d */
	memcpy(d, r, count * elem_bytes);
}
void advance_eye_paths(void* buffers[], void* args_orig) { const timeval start_time = my_WallClockTime(); // cl_args const starpu_args args; unsigned iteration; starpu_codelet_unpack_args(args_orig, &args, &iteration); // buffers // hit point static info HitPointPosition* const hit_points = reinterpret_cast<HitPointPosition* const>(STARPU_VECTOR_GET_PTR(buffers[0])); // eye paths EyePath* const eye_paths = reinterpret_cast<EyePath* const>(STARPU_VECTOR_GET_PTR(buffers[1])); const unsigned eye_paths_count = STARPU_VECTOR_GET_NX(buffers[1]); // seed buffer Seed* const seed_buffer = reinterpret_cast<Seed* const>(STARPU_VECTOR_GET_PTR(buffers[2])); advance_eye_paths_impl(hit_points, // hit_points_count, eye_paths, eye_paths_count, seed_buffer, // seed_buffer_count, args.cpu_scene, args.config->max_eye_path_depth, starpu_combined_worker_get_size()); const timeval end_time = my_WallClockTime(); task_info("CPU", 0, starpu_combined_worker_get_size(), iteration, start_time, end_time, "(3) advance_eye_paths"); }
/*
 * CPU kernel multiplying every element of a buffer by a constant factor.
 *
 * buffers[0] is handled as a vector interface
 * (struct starpu_vector_interface *): the "buffers" array matches the
 * task->handles array, so its first entry describes the vector registered
 * as task->handles[0]. cl_arg points to the float scaling factor.
 */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
	float *scale = cl_arg;
	struct starpu_vector_interface *vec = buffers[0];

	/* local copy of the data (possibly migrated/replicated); the .ptr
	 * field is a uintptr_t since a vector may hold any element type,
	 * hence the cast to (float *) */
	float *data = (float *)STARPU_VECTOR_GET_PTR(vec);

	/* number of elements in the vector */
	unsigned count = STARPU_VECTOR_GET_NX(vec);

	unsigned k = 0;

	/* scale the elements one by one */
	while (k < count)
	{
		data[k] = data[k] * *scale;
		k++;
	}
}
void task_region_h(void *buffers[], void *_args) { void **args = _args; struct starpu_vector_interface *_vector = buffers[0]; int nx = STARPU_VECTOR_GET_NX(_vector); int elemsize = STARPU_VECTOR_GET_ELEMSIZE(_vector); int slice_base = STARPU_VECTOR_GET_SLICE_BASE(_vector); int *v = (int *)STARPU_VECTOR_GET_PTR(_vector); int f = (int)(intptr_t)args[0]; int imin = (int)(intptr_t)args[1]; int imax = (int)(intptr_t)args[2]; int i; assert(elemsize == sizeof(v[0])); printf("depth 2 task, entry: vector ptr = %p, slice_base = %d, imin = %d, imax = %d\n", v, slice_base, imin, imax); for (i = imin; i < imax; i++) { assert(i-slice_base>=0); assert(i-slice_base<NX); (v-slice_base)[i] += f; } printf("depth 2 task ending\n"); }
/*
 *	CUBLAS codelet: computes the dot product of the vectors descr[0] and
 *	descr[1] with cublasSdot and stores
 *		pb->alpha = pb->delta_new / dot
 *	where pb is the struct cg_problem passed as codelet argument.
 */
void cublas_codelet_func_5(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;
	float *d = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *q = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	uint32_t len;
	float dq;

	/* both operands must have the same number of elements */
	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
	len = STARPU_VECTOR_GET_NX(descr[0]);

	dq = cublasSdot(len, d, 1, q, 1);

	problem->alpha = problem->delta_new / dq;
}
/* Codelet filling the buffer: element i receives the value nx + i,
 * nx being the number of elements of the vector. cl_arg is not used. */
void fill_tmp_buffer(void *buffers[], void *cl_arg)
{
	int count = STARPU_VECTOR_GET_NX(buffers[0]);
	int *out = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
	int k = 0;

	while (k < count)
	{
		out[k] = count + k;
		k++;
	}
}
static void memset_cuda(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream()); }
/*
 * CPU codelet: set every byte of the vector descr[0] to 42.
 * The size is computed as n * sizeof(int); the codelet argument is not used.
 */
void memset_cpu(void *descr[], void *arg)
{
	STARPU_SKIP_IF_VALGRIND;

	unsigned count = STARPU_VECTOR_GET_NX(descr[0]);
	int *base = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
	size_t nbytes = count * sizeof(*base);

	/* byte-wise fill */
	memset(base, 42, nbytes);
}
/* Codelet checking the content of the buffer: element i must be equal to
 * nx + i, nx being the number of elements. cl_arg is not used. */
void read_ghost(void *buffers[], void *cl_arg)
{
	int nx = STARPU_VECTOR_GET_NX(buffers[0]);
	int *tmp = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
	int i = 0;

	while (i < nx)
	{
		assert(tmp[i]==nx+i);
		++i;
	}
}
/*
 * GPU codelet: calls CUBLASAXPY on the vectors descr[0] (x) and descr[1] (y)
 * with unit strides; the length is the NX of descr[0].
 *
 * arg points to the TYPE scalar alpha (it is read, even though the
 * parameter carries STARPU_ATTRIBUTE_UNUSED).
 */
void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE *x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned len = STARPU_VECTOR_GET_NX(descr[0]);
	TYPE scale = *((TYPE *)arg);

	CUBLASAXPY((int)len, scale, x, 1, y, 1);
}
//! [To be included. You should update doxygen if you see this text.]
/*
 * CPU kernel multiplying every element of the vector buffers[0] by the
 * float pointed to by _args. The loop is an OpenMP parallel for whose
 * thread count is starpu_combined_worker_get_size().
 */
void scal_cpu_func(void *buffers[], void *_args)
{
	struct starpu_vector_interface *vec = buffers[0];
	float *data = (float *)STARPU_VECTOR_GET_PTR(vec);
	unsigned count = STARPU_VECTOR_GET_NX(vec);
	float *scale = _args;
	unsigned k;

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
	for (k = 0; k < count; k++)
		data[k] = data[k] * *scale;
}
void task_region_g(void *buffers[], void *args) { struct starpu_vector_interface *_vector_1 = buffers[0]; int nx1 = STARPU_VECTOR_GET_NX(_vector_1); int *v1 = (int *)STARPU_VECTOR_GET_PTR(_vector_1); struct starpu_vector_interface *_vector_2 = buffers[1]; int nx2 = STARPU_VECTOR_GET_NX(_vector_2); int *v2 = (int *)STARPU_VECTOR_GET_PTR(_vector_2); int f = (int)(intptr_t)args; STARPU_ASSERT(nx1 == nx2); printf("depth 1 task, entry: vector_1 ptr = %p\n", v1); printf("depth 1 task, entry: vector_2 ptr = %p\n", v2); printf("depth 1 task, entry: f = %d\n", f); fprintf(stderr, "cudaMemcpy: -->\n"); cudaMemcpy(v2,v1,nx1*sizeof(*_vector_1), cudaMemcpyDeviceToDevice); fprintf(stderr, "cudaMemcpy: <--\n"); }
/*
 *	CPU codelet: calls STARPU_SAXPY with scalar pb->alpha, the vector
 *	descr[1] as first operand and the vector descr[0] as second operand
 *	(x = x + alpha d, assuming the usual BLAS AXPY convention).
 *	pb is the struct cg_problem passed as codelet argument.
 */
void cpu_codelet_func_6(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vectors */
	float *x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *d = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	uint32_t len = STARPU_VECTOR_GET_NX(descr[0]);

	STARPU_SAXPY(len, problem->alpha, d, 1, x, 1);
}
/*
 *	CUBLAS codelet: r = r - alpha q
 *		descr[0] = r, descr[1] = q
 *	alpha is read from the struct cg_problem passed as codelet argument.
 */
void cublas_codelet_func_7(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vectors */
	float *r = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *q = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	uint32_t len = STARPU_VECTOR_GET_NX(descr[0]);

	cublasSaxpy(len, -problem->alpha, q, 1, r, 1);
}
/*
 *	CUBLAS codelet: computes the dot product of the vector descr[0] with
 *	itself and stores it into both pb->delta_new and pb->delta_0, pb being
 *	the struct cg_problem passed as codelet argument.
 */
void cublas_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vector */
	float *v = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	uint32_t len = STARPU_VECTOR_GET_NX(descr[0]);

	float vv = cublasSdot(len, v, 1, v, 1);

	problem->delta_new = vv;
	problem->delta_0 = vv;
}
/*
 *	Dot product codelet
 *
 *	Adds, one after the other, every element of the long int vector
 *	descr[0] to the long int variable descr[1]. cl_arg is not used.
 */
void dot_cpu_func(void *descr[], void *cl_arg)
{
	long int *acc = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
	long int *x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned count = STARPU_VECTOR_GET_NX(descr[0]);
	unsigned k = 0;

	/* the variable is updated in place at every step */
	while (k < count)
	{
		*acc = *acc + x[k];
		k++;
	}
}
void axpy_opencl(void *buffers[], void *_args) { TYPE *alpha = _args; int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(x), &x); err|= clSetKernelArg(kernel, 1, sizeof(y), &y); err|= clSetKernelArg(kernel, 2, sizeof(n), &n); err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
/*
 *	CUBLAS codelet: computes the dot product of the vector descr[0] with
 *	itself, then updates the struct cg_problem passed as argument:
 *		delta_old = delta_new
 *		delta_new = dot
 *		beta = delta_new / delta_old
 */
void cublas_codelet_func_8(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vector */
	float *r = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	uint32_t len = STARPU_VECTOR_GET_NX(descr[0]);

	float rr = cublasSdot(len, r, 1, r, 1);

	/* the previous value must be saved before it is overwritten */
	problem->delta_old = problem->delta_new;
	problem->delta_new = rr;

	problem->beta = problem->delta_new / problem->delta_old;
}
/*
 *	CUBLAS codelet: d = r + beta d
 *		descr[0] = d, descr[1] = r
 *	beta is read from the struct cg_problem passed as codelet argument.
 */
void cublas_codelet_func_9(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vectors */
	float *d = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *r = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	uint32_t len = STARPU_VECTOR_GET_NX(descr[0]);

	/* first step: d = beta d */
	cublasSscal(len, problem->beta, d, 1);

	/* second step: d = r + d */
	cublasSaxpy(len, 1.0f, r, 1, d, 1);
}
/*
 * CPU codelet: increment every element of the unsigned vector descr[0],
 * going through the vector descr[1] used as a temporary copy.
 * The codelet argument is not used.
 */
void cpu_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
	STARPU_SKIP_IF_VALGRIND;

	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned *scratch = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);

	size_t elem_bytes = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
	unsigned count = STARPU_VECTOR_GET_NX(descr[0]);
	unsigned k = 0;

	/* save the current content into the temporary vector ... */
	memcpy(scratch, data, count*elem_bytes);

	/* ... and write it back, incremented by one */
	while (k < count)
	{
		data[k] = scratch[k] + 1;
		k++;
	}
}
void dot_cpu_func(void *descr[], void *cl_arg) { float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); DOT_TYPE local_dot = 0.0; unsigned i; for (i = 0; i < n; i++) { local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i]; } *dot = *dot + local_dot; }
/*
 *	CPU codelet: calls STARPU_SDOT on the vector descr[0] with itself
 *	(unit strides), prints the result on stderr and stores it into both
 *	pb->delta_new and pb->delta_0, pb being the struct cg_problem passed
 *	as codelet argument.
 */
void cpu_codelet_func_3(void *descr[], void *arg)
{
	struct cg_problem *problem = arg;

	/* get the vector */
	float *v = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	int len = (int)STARPU_VECTOR_GET_NX(descr[0]);

	float vv = STARPU_SDOT(len, v, 1, v, 1);

	fprintf(stderr, "func 3 : DOT = %f\n", vv);

	problem->delta_new = vv;
	problem->delta_0 = vv;
}
void dot_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE current_dot; DOT_TYPE local_dot; float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemcpyAsync(¤t_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1); /* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */ current_dot += local_dot; cudaMemcpyAsync(dot, ¤t_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); }
void vector_scal_opencl(void *buffers[], void *_args) { float *factor = _args; int id, devid, err; cl_kernel kernel; cl_command_queue queue; cl_event event; /* length of the vector */ unsigned n = STARPU_VECTOR_GET_NX(buffers[0]); /* OpenCL copy of the vector pointer */ cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl", devid); /* Name of the codelet defined above */ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); err |= clSetKernelArg(kernel, 1, sizeof(n), &n); err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local=1; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); }
void minmax_cpu_func(void *descr[], void *cl_arg) { /* The array containing the values */ TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); TYPE local_min = minmax[0]; TYPE local_max = minmax[1]; /* Compute the min and the max elements in the array */ unsigned i; for (i = 0; i < n; i++) { TYPE val = local_array[i]; local_min = STARPU_MIN(local_min, val); local_max = STARPU_MAX(local_max, val); } minmax[0] = local_min; minmax[1] = local_max; }
#define CUBLASAXPY cublasSaxpy #define N (16*1024*1024) #define NBLOCKS 8 TYPE *vec_x, *vec_y; /* descriptors for StarPU */ starpu_data_handle handle_y, handle_x; void axpy_cpu(void *descr[], __attribute__((unused)) void *arg) { TYPE alpha = *((TYPE *)arg); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]); AXPY((int)n, alpha, block_x, 1, block_y, 1); } #ifdef STARPU_USE_CUDA void axpy_gpu(void *descr[], __attribute__((unused)) void *arg) { TYPE alpha = *((TYPE *)arg); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
void task_region_g(void *buffers[], void *args) { struct starpu_vector_interface *_vector = buffers[0]; int nx = STARPU_VECTOR_GET_NX(_vector); int *v = (int *)STARPU_VECTOR_GET_PTR(_vector); int f = (int)(intptr_t)args; printf("depth 1 task, entry: vector ptr = %p\n", v); { int i; for (i = 0; i < nx; i++) { v[i] += f; } } { const int half_nx = nx/2; starpu_data_handle_t vector_slice_1_handle; starpu_vector_data_register(&vector_slice_1_handle, STARPU_MAIN_RAM, (uintptr_t)&v[0], half_nx, sizeof(v[0])); printf("depth 1 task, block 1: vector_slice_1_handle = %p\n", vector_slice_1_handle); starpu_data_handle_t vector_slice_2_handle; starpu_vector_data_register(&vector_slice_2_handle, STARPU_MAIN_RAM, (uintptr_t)&v[half_nx], nx-half_nx, sizeof(v[0])); /* set slice base */ starpu_omp_vector_annotate(vector_slice_2_handle, half_nx); printf("depth 1 task, block 1: vector_slice_2_handle = %p\n", vector_slice_2_handle); } void *cl_arg_1[3]; void *cl_arg_2[3]; { struct starpu_omp_task_region_attr attr; const int half_nx = nx/2; int i; starpu_data_handle_t vector_slice_1_handle = starpu_data_lookup(&v[0]); printf("depth 1 task, block 2: vector_slice_1_handle = %p\n", vector_slice_1_handle); starpu_data_handle_t vector_slice_2_handle = starpu_data_lookup(&v[half_nx]); printf("depth 1 task, block 2: vector_slice_2_handle = %p\n", vector_slice_2_handle); memset(&attr, 0, sizeof(attr)); attr.cl.cpu_funcs[0] = task_region_h; attr.cl.where = STARPU_CPU; attr.cl.nbuffers = 1; attr.cl.modes[0] = STARPU_RW; attr.cl_arg_size = 3*sizeof(void *); attr.cl_arg_free = 0; attr.if_clause = 1; attr.final_clause = 0; attr.untied_clause = 1; attr.mergeable_clause = 0; i = 0; cl_arg_1[0] = (void *)(intptr_t)i++; cl_arg_1[1] = (void *)(intptr_t)0; cl_arg_1[2] = (void *)(intptr_t)half_nx; attr.cl_arg = cl_arg_1; attr.handles = &vector_slice_1_handle; starpu_omp_task_region(&attr); cl_arg_2[0] = (void *)(intptr_t)i++; cl_arg_2[1] = (void *)(intptr_t)half_nx; cl_arg_2[2] = (void *)(intptr_t)nx; 
attr.cl_arg = cl_arg_2; attr.handles = &vector_slice_2_handle; starpu_omp_task_region(&attr); } starpu_omp_taskwait(); }
#include <stdlib.h> #define NLOOPS 1000 #define VECTORSIZE 1024 static starpu_data_handle v_handle; /* * Memset */ #ifdef STARPU_USE_CUDA static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args) { char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned length = STARPU_VECTOR_GET_NX(descr[0]); cudaMemset(buf, 42, length); cudaThreadSynchronize(); } #endif static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args) { char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned length = STARPU_VECTOR_GET_NX(descr[0]); memset(buf, 42, length); } static starpu_codelet memset_cl = {