void redux_cpu_func(void *descr[], void *cl_arg) { DOT_TYPE *dota = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); *dota = *dota + *dotb; }
void func_cpu_noargs(void *descr[], void *_args) { int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]); float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]); *x0 = *x0 * _ifactor; *x1 = *x1 * _ffactor; }
void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]); int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]); *x0 += 1; *x1 *= *x1; }
void redux_cpu_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]); *dst = *dst + *src; }
/* * Codelet to perform the reduction of two elements */ void redux_cpu_func(void *descr[], void *cl_arg) { long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]); long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]); *dota = *dota + *dotb; FPRINTF_MPI(stderr, "Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb); }
void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { float *xy = (float *)STARPU_VARIABLE_GET_PTR(descr[0]); float *xm1y = (float *)STARPU_VARIABLE_GET_PTR(descr[1]); float *xp1y = (float *)STARPU_VARIABLE_GET_PTR(descr[2]); float *xym1 = (float *)STARPU_VARIABLE_GET_PTR(descr[3]); float *xyp1 = (float *)STARPU_VARIABLE_GET_PTR(descr[4]); // fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1); *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5; // fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1); }
void func_cpu_args(void *descr[], void *_args) { int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]); float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]); int ifactor; float ffactor; starpu_codelet_unpack_args(_args, &ifactor, &ffactor); *x0 = *x0 * ifactor; *x1 = *x1 * ffactor; }
void minmax_redux_cpu_func(void *descr[], void *cl_arg) { TYPE *array_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); TYPE *array_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); /* Compute the min value */ TYPE min_dst = array_dst[0]; TYPE min_src = array_src[0]; array_dst[0] = STARPU_MIN(min_dst, min_src); /* Compute the max value */ TYPE max_dst = array_dst[1]; TYPE max_src = array_src[1]; array_dst[1] = STARPU_MAX(max_dst, max_src); }
void neutral_cpu_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); *dst = 0; }
int convert_variable_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) { *ptr = STARPU_VARIABLE_GET_PTR(interface); (*ss).size = STARPU_VARIABLE_GET_ELEMSIZE(interface); return 0; }
void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]); *val += 1; printf("task executing \n"); }
void func_cpu(void *descr[], void *args) { float *x = (float *) STARPU_VARIABLE_GET_PTR(descr[0]); float factor; factor = *(float *) args; *x *= factor; }
static void redux_cuda_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]); unsigned host_dst, host_src; /* This is a dummy technique of course */ cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); host_dst += host_src; cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); }
void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args) { int *value = (int *)STARPU_VARIABLE_GET_PTR(descr[0]); int rank; starpu_codelet_unpack_args(_args, &rank); FPRINTF_MPI(stderr, "Executing codelet with value %d and rank %d\n", *value, rank); STARPU_ASSERT_MSG(*value == rank, "Received value %d is not the expected value %d\n", *value, rank); }
uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle) { unsigned node; node = _starpu_memory_node_get_local_key(); STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node)); return STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node)); }
static void neutral_cuda_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]); /* This is a dummy technique of course */ unsigned host_dst = 0; cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); }
void minmax_neutral_cpu_func(void *descr[], void *cl_arg) { TYPE *array = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); /* Initialize current min to the greatest possible value. */ array[0] = TYPE_MAX; /* Initialize current max to the smallest possible value. */ array[1] = TYPE_MIN; }
void redux_opencl_func(void *buffers[], void *args) { int id, devid; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]); cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(dota), &dota); err|= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local=global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } starpu_opencl_release_kernel(kernel); }
static void redux_opencl_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned h_dst, h_src; cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]); cl_mem d_src = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[1]); cl_command_queue queue; starpu_opencl_get_current_queue(&queue); /* This is a dummy technique of course */ clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); clEnqueueReadBuffer(queue, d_src, CL_TRUE, 0, sizeof(unsigned), (void *)&h_src, 0, NULL, NULL); h_dst += h_src; clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); }
static void neutral_opencl_kernel(void *descr[], void *arg) { STARPU_SKIP_IF_VALGRIND; unsigned h_dst = 0; cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]); cl_command_queue queue; starpu_opencl_get_current_queue(&queue); clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); }
/* * Dot product codelet */ void dot_cpu_func(void *descr[], void *cl_arg) { long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]); //FPRINTF_MPI(stderr, "Before dot=%ld (adding %d elements...)\n", *dot, n); unsigned i; for (i = 0; i < n; i++) { //FPRINTF_MPI(stderr, "Adding %ld\n", local_x[i]); *dot += local_x[i]; } //FPRINTF_MPI(stderr, "After dot=%ld\n", *dot); }
static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface) { unsigned node; for (node = 0; node < STARPU_MAXNODES; node++) { starpu_variable_interface_t *local_interface = starpu_data_get_interface_on_node(handle, node); if (node == home_node) { local_interface->ptr = STARPU_VARIABLE_GET_PTR(interface); } else { local_interface->ptr = 0; } local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(interface); } }
void dot_cpu_func(void *descr[], void *cl_arg) { float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); DOT_TYPE local_dot = 0.0; unsigned i; for (i = 0; i < n; i++) { local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i]; } *dot = *dot + local_dot; }
void init_opencl_func(void *buffers[], void *args) { cl_int err; cl_command_queue queue; cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]); starpu_opencl_get_current_queue(&queue); DOT_TYPE zero = (DOT_TYPE) 0.0; err = clEnqueueWriteBuffer(queue, dot, CL_TRUE, 0, sizeof(DOT_TYPE), &zero, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); }
void dot_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE current_dot; DOT_TYPE local_dot; float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]); float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]); DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); cudaMemcpyAsync(¤t_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1); /* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */ current_dot += local_dot; cudaMemcpyAsync(dot, ¤t_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream()); cudaStreamSynchronize(starpu_cuda_get_local_stream()); }
void minmax_cpu_func(void *descr[], void *cl_arg) { /* The array containing the values */ TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); TYPE local_min = minmax[0]; TYPE local_max = minmax[1]; /* Compute the min and the max elements in the array */ unsigned i; for (i = 0; i < n; i++) { TYPE val = local_array[i]; local_min = STARPU_MIN(local_min, val); local_max = STARPU_MAX(local_max, val); } minmax[0] = local_min; minmax[1] = local_max; }
static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned node) { STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node)); return (void*) STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node)); }
/* * Codelet to create a neutral element */ void init_cpu_func(void *descr[], void *cl_arg) { long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]); *dot = 0; FPRINTF_MPI(stderr, "Init dot\n"); }
void init_cuda_func(void *descr[], void *cl_arg) { DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream()); }
void init_cpu_func(void *descr[], void *cl_arg) { DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); *dot = 0.0f; }