/*
 * Partitioning filter for CSR matrices: the father's rows are cut into
 * nparts consecutive bands of ceil(nrow / nparts) rows each (the band
 * covering the end of the matrix may be shorter), and child number `id`
 * is made to describe band `id`.
 *
 * father_interface / child_interface are treated as
 * struct starpu_csr_interface pointers. `f` is not read here.
 *
 * nnz, nrow, firstentry and elemsize of the child are always filled in;
 * the rowptr / colind / nzval pointers are only derived when the father
 * has a non-zero nzval.
 */
static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
{
	struct starpu_csr_interface *parent = (struct starpu_csr_interface *) father_interface;
	struct starpu_csr_interface *part = (struct starpu_csr_interface *) child_interface;

	uint32_t *parent_rowptr = parent->rowptr;
	uint32_t parent_first = parent->firstentry;
	uint32_t total_rows = parent->nrow;
	size_t elem_bytes = parent->elemsize;

	/* Rows per band, rounded up so that nparts bands cover every row */
	uint32_t rows_per_part = (total_rows + nparts - 1)/nparts;

	/* Position of this band's first row inside the father's rowptr array */
	uint32_t band_start = id*rows_per_part - parent_first;

	/* The band is clipped to the rows that remain in the father */
	uint32_t band_rows = STARPU_MIN(rows_per_part, total_rows - id*rows_per_part);

	/* rowptr brackets the non-zero entries owned by the band */
	uint32_t band_first_entry = parent_rowptr[band_start];
	uint32_t band_nnz = parent_rowptr[band_start + band_rows] - band_first_entry;

	part->elemsize = elem_bytes;
	part->firstentry = band_first_entry;
	part->nrow = band_rows;
	part->nnz = band_nnz;

	if (!parent->nzval)
		return;

	part->rowptr = &parent->rowptr[band_start];
	part->colind = &parent->colind[band_first_entry];
	part->nzval = parent->nzval + band_first_entry * elem_bytes;
}
void _starpu_timing_init(void) { static starpu_tick_t t1, t2; int i; if (inited) return; residual = (unsigned long long)1 << 63; for(i = 0; i < 20; i++) { STARPU_GET_TICK(t1); STARPU_GET_TICK(t2); residual = STARPU_MIN(residual, TICK_RAW_DIFF(t1, t2)); } { struct timeval tv1,tv2; STARPU_GET_TICK(t1); gettimeofday(&tv1,0); usleep(500000); STARPU_GET_TICK(t2); gettimeofday(&tv2,0); scale = ((tv2.tv_sec*1e6 + tv2.tv_usec) - (tv1.tv_sec*1e6 + tv1.tv_usec)) / (double)(TICK_DIFF(t1, t2)); } STARPU_GET_TICK(reference_start_tick); inited = 1; }
/*
 * Print the leading entries of vector_in_ptr and vector_out_ptr side by
 * side, one tab-separated pair per line, for at most 16 rows (fewer when
 * `size` is smaller).
 */
void print_results(void)
{
	unsigned idx = 0;

	while (idx < STARPU_MIN(size, 16))
	{
		printf("%2.2f\t%2.2f\n", vector_in_ptr[idx], vector_out_ptr[idx]);
		idx++;
	}
}
/*
 * Return the smallest of the n values in y, starting the search from
 * 1.0e30. That starting value is therefore what comes back when n is 0 or
 * when every entry is larger than it.
 */
static double find_list_min(double *y, unsigned n)
{
	double smallest = 1.0e30;
	unsigned k = 0;

	while (k < n)
	{
		smallest = STARPU_MIN(smallest, y[k]);
		k++;
	}

	return smallest;
}
void minmax_redux_cpu_func(void *descr[], void *cl_arg) { TYPE *array_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); TYPE *array_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); /* Compute the min value */ TYPE min_dst = array_dst[0]; TYPE min_src = array_src[0]; array_dst[0] = STARPU_MIN(min_dst, min_src); /* Compute the max value */ TYPE max_dst = array_dst[1]; TYPE max_src = array_src[1]; array_dst[1] = STARPU_MAX(max_dst, max_src); }
/*
 * Three-way comparison of two data paths, each given as an array of child
 * indices together with its depth.
 *
 * The paths are compared level by level over their common prefix; the
 * first differing level decides (-1 when A's index is smaller, 1
 * otherwise). When one path is a prefix of the other, the shorter one is
 * ordered first, by convention: it is the father of the other. Identical
 * paths yield 0.
 */
static int _compar_data_paths(const unsigned pathA[], unsigned depthA, const unsigned pathB[], unsigned depthB)
{
	unsigned lvl;

	/* Only the levels present in both paths can be compared */
	for (lvl = 0; lvl < depthA && lvl < depthB; lvl++)
	{
		unsigned idxA = pathA[lvl];
		unsigned idxB = pathB[lvl];

		if (idxA < idxB)
			return -1;
		if (idxA > idxB)
			return 1;
	}

	/* Common prefix is identical: the depth decides */
	if (depthA < depthB)
		return -1;
	if (depthA > depthB)
		return 1;

	return 0;
}
void minmax_cpu_func(void *descr[], void *cl_arg) { /* The array containing the values */ TYPE *local_array = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]); unsigned n = STARPU_VECTOR_GET_NX(descr[0]); TYPE *minmax = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]); TYPE local_min = minmax[0]; TYPE local_max = minmax[1]; /* Compute the min and the max elements in the array */ unsigned i; for (i = 0; i < n; i++) { TYPE val = local_array[i]; local_min = STARPU_MIN(local_min, val); local_max = STARPU_MAX(local_max, val); } minmax[0] = local_min; minmax[1] = local_max; }
/*
 * CPU GEMM kernel: computes C = A * B (alpha = 1, beta = 0) through
 * CPU_GEMM on the three matrix buffers descr[0] (A), descr[1] (B) and
 * descr[2] (C). `arg` is unused.
 *
 * When run by a single worker the whole product is computed in one call.
 * When run by a combined worker, the ny dimension of C is cut into
 * ceil(nyC / worker_size) wide slices and each rank computes only its own
 * slice.
 */
static void cpu_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
{
	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);

	int worker_size = starpu_combined_worker_get_size();

	if (worker_size == 1)
	{
		/* Sequential CPU task */
		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
	}
	else
	{
		/* Parallel CPU task */
		unsigned rank = starpu_combined_worker_get_rank();

		unsigned block_size = (nyC + worker_size - 1)/worker_size;
		unsigned first = block_size*rank;

		/* B and C must agree on ny. This used to be written with `=`,
		 * which overwrote nyC instead of checking it. */
		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));

		/* With rounded-up slices the trailing ranks may have nothing
		 * left to compute (e.g. nyC = 5 over 4 workers leaves rank 3
		 * empty). Bail out rather than let the unsigned subtraction
		 * below wrap around to a huge slice width. */
		if (first >= nyC)
			return;

		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - first;

		TYPE *new_subB = &subB[first];
		TYPE *new_subC = &subC[first];

		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
	}
}
/*
 * starpu_block_filter_func_vector: block-partitioning filter for vectors.
 *
 * The father vector (nx elements of elemsize bytes) is cut into nchunks
 * consecutive chunks of ceil(nx / nchunks) elements. Child number `id`
 * gets min(chunk_size, nx - id*chunk_size) elements and the father's
 * elemsize. `f` is unused. The function asserts nchunks <= nx.
 *
 * When the father has a non-zero ptr, the child's ptr and offset are the
 * father's advanced by id*chunk_size*elemsize bytes, and dev_handle is
 * copied as is; otherwise those three fields are left untouched.
 *
 * NOTE(review): nx - id*chunk_size is unsigned arithmetic. Because
 * chunk_size is rounded up, id*chunk_size can exceed nx for trailing
 * chunks (e.g. nx = 5, nchunks = 4, id = 3), in which case the subtraction
 * wraps and child_nx ends up equal to chunk_size. Confirm whether callers
 * can reach that case.
 */
void starpu_block_filter_func_vector(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks) { starpu_vector_interface_t *vector_father = father_interface; starpu_vector_interface_t *vector_child = child_interface; uint32_t nx = vector_father->nx; size_t elemsize = vector_father->elemsize; STARPU_ASSERT(nchunks <= nx); uint32_t chunk_size = (nx + nchunks - 1)/nchunks; size_t offset = id*chunk_size*elemsize; uint32_t child_nx = STARPU_MIN(chunk_size, nx - id*chunk_size); vector_child->nx = child_nx; vector_child->elemsize = elemsize; if (vector_father->ptr) { vector_child->ptr = vector_father->ptr + offset; vector_child->dev_handle = vector_father->dev_handle; vector_child->offset = vector_father->offset + offset; } } void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nchunks) { /* there cannot be more than 2 chunks */
int main(int argc, char **argv) { int ret; unsigned part; double timing; double start, end; unsigned row, pos; unsigned ind; /* CSR matrix description */ float *nzval; uint32_t nnz; uint32_t *colind; uint32_t *rowptr; /* Input and Output vectors */ float *vector_in_ptr; float *vector_out_ptr; /* * Parse command-line arguments */ parse_args(argc, argv); /* * Launch StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* * Create a 3-band sparse matrix as input example */ nnz = 3*size-2; starpu_malloc((void **)&nzval, nnz*sizeof(float)); starpu_malloc((void **)&colind, nnz*sizeof(uint32_t)); starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t)); assert(nzval && colind && rowptr); /* fill the matrix */ for (row = 0, pos = 0; row < size; row++) { rowptr[row] = pos; if (row > 0) { nzval[pos] = 1.0f; colind[pos] = row-1; pos++; } nzval[pos] = 5.0f; colind[pos] = row; pos++; if (row < size - 1) { nzval[pos] = 1.0f; colind[pos] = row+1; pos++; } } STARPU_ASSERT(pos == nnz); rowptr[size] = nnz; /* initiate the 2 vectors */ starpu_malloc((void **)&vector_in_ptr, size*sizeof(float)); starpu_malloc((void **)&vector_out_ptr, size*sizeof(float)); assert(vector_in_ptr && vector_out_ptr); /* fill them */ for (ind = 0; ind < size; ind++) { vector_in_ptr[ind] = 2.0f; vector_out_ptr[ind] = 0.0f; } /* * Register the CSR matrix and the 2 vectors */ starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float)); starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float)); starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float)); /* * Partition the CSR matrix and the output vector */ csr_f.nchildren = nblocks; vector_f.nchildren = nblocks; starpu_data_partition(sparse_matrix, &csr_f); starpu_data_partition(vector_out, &vector_f); /* * If we use OpenCL, we need to compile the 
SpMV kernel */ #ifdef STARPU_USE_OPENCL compile_spmv_opencl_kernel(); #endif start = starpu_timing_now(); /* * Create and submit StarPU tasks */ for (part = 0; part < nblocks; part++) { struct starpu_task *task = starpu_task_create(); task->cl = &spmv_cl; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = vector_in; task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part); ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } } starpu_task_wait_for_all(); end = starpu_timing_now(); /* * Unregister the CSR matrix and the output vector */ starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); starpu_data_unpartition(vector_out, STARPU_MAIN_RAM); /* * Unregister data */ starpu_data_unregister(sparse_matrix); starpu_data_unregister(vector_in); starpu_data_unregister(vector_out); /* * Display the result */ for (row = 0; row < STARPU_MIN(size, 16); row++) { FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]); } starpu_free(nzval); starpu_free(colind); starpu_free(rowptr); starpu_free(vector_in_ptr); starpu_free(vector_out_ptr); /* * Stop StarPU */ starpu_shutdown(); timing = end - start; FPRINTF(stderr, "Computation took (in ms)\n"); FPRINTF(stdout, "%2.2f\n", timing/1000); return 0; }
/* y = ax^b + c * return 0 if success, -1 otherwise * if success, a, b and c are modified * */ int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c) { unsigned n = find_list_size(ptr); unsigned *x = malloc(n*sizeof(unsigned)); STARPU_ASSERT(x); double *y = malloc(n*sizeof(double)); STARPU_ASSERT(y); dump_list(x, y, ptr); double cmin = 0.0; double cmax = find_list_min(y, n); unsigned iter; double err = 100000.0; for (iter = 0; iter < MAXREGITER; iter++) { double c1, c2; double r1, r2; double radius = 0.01; c1 = cmin + (0.5-radius)*(cmax - cmin); c2 = cmin + (0.5+radius)*(cmax - cmin); r1 = test_r(c1, n, x, y); r2 = test_r(c2, n, x, y); double err1, err2; err1 = fabs(1.0 - r1); err2 = fabs(1.0 - r2); if (err1 < err2) { cmax = (cmin + cmax)/2; } else { /* 2 is better */ cmin = (cmin + cmax)/2; } if (fabs(err - STARPU_MIN(err1, err2)) < EPS) { err = STARPU_MIN(err1, err2); break; } err = STARPU_MIN(err1, err2); } *c = (cmin + cmax)/2; *b = compute_b(*c, n, x, y); *a = exp(compute_a(*c, *b, n, x, y)); free(x); free(y); return 0; }