static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel) { int ret; struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel)); task->cl = &cl21; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j); if (j == k+1) { task->priority = STARPU_MAX_PRIO; } /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel)); } else { starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel)); } int n = starpu_matrix_get_nx(task->handles[0]); task->flops = FLOPS_STRSM(n, n); ret = starpu_task_submit(task); if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); return ret; }
static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j) { int ret; struct starpu_task *task = create_task(TAG21(k, j)); task->cl = &cl21; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j); if (!no_prio && (j == k+1)) { task->priority = STARPU_MAX_PRIO; } /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j)); } else { starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k)); } ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); }
static void create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned i) { int ret; /* printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */ struct starpu_task *task = create_task(TAG12(k, i)); task->cl = &cl12; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); task->handles[1] = starpu_data_get_sub_data(dataA, 2, i, k); if (!no_prio && (i == k+1)) { task->priority = STARPU_MAX_PRIO; } /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG12(k, i), 2, TAG11(k), TAG22(k-1, i, k)); } else { starpu_tag_declare_deps(TAG12(k, i), 1, TAG11(k)); } ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); }
int main(int argc, char **argv) { int ret; starpu_init(NULL); starpu_data_malloc_pinned_if_possible((void **)&buffer, VECTORSIZE); starpu_vector_data_register(&v_handle, 0, (uintptr_t)buffer, VECTORSIZE, sizeof(char)); struct starpu_data_filter f = { .filter_func = starpu_vector_divide_in_2_filter_func, /* there are only 2 children */ .nchildren = 2, /* the length of the first part */ .filter_arg = VECTORSIZE/2, .get_nchildren = NULL, .get_child_ops = NULL }; unsigned iter; for (iter = 0; iter < NITER; iter++) { starpu_data_map_filters(v_handle, 1, &f); ret = use_handle(starpu_data_get_sub_data(v_handle, 1, 0)); if (ret == -ENODEV) goto enodev; ret = use_handle(starpu_data_get_sub_data(v_handle, 1, 1)); if (ret == -ENODEV) goto enodev; starpu_task_wait_for_all(); starpu_data_unpartition(v_handle, 0); ret = use_handle(v_handle); if (ret == -ENODEV) goto enodev; starpu_task_wait_for_all(); } starpu_data_unregister(v_handle); starpu_shutdown(); return 0; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ return 0; }
static void launch_codelets(void) { #ifdef STARPU_USE_FXT _starpu_fxt_register_thread(0); #endif /* partition the work into slices */ unsigned taskx, tasky; srand(time(NULL)); /* should we use a single performance model for all archs and use an * acceleration factor ? */ if (use_common_model) { cl.model = &sgemm_model_common; } else { cl.model = &sgemm_model; } for (taskx = 0; taskx < nslicesx; taskx++) { for (tasky = 0; tasky < nslicesy; tasky++) { /* A B[task] = C[task] */ struct starpu_task *task = starpu_task_create(); task->cl = &cl; task->cl_arg = &conf; task->cl_arg_size = sizeof(struct block_conf); task->callback_func = callback_func; task->callback_arg = NULL; starpu_tag_t tag = TAG(taskx, tasky); task->use_tag = 1; task->tag_id = tag; task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky); task->buffers[0].mode = STARPU_R; task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx); task->buffers[1].mode = STARPU_R; task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky); task->buffers[2].mode = STARPU_RW; starpu_task_submit(task, NULL); } } }
static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k) { /* FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */ struct starpu_task *task = create_task(TAG11(k)); task->cl = &cl11; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); /* this is an important task */ if (!noprio) task->priority = STARPU_MAX_PRIO; /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k)); } int n = starpu_matrix_get_nx(task->handles[0]); task->flops = FLOPS_SPOTRF(n); return task; }
static int create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned j) { int ret; struct starpu_task *task = starpu_task_create(); task->cl = &cl12; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); task->handles[1] = starpu_data_get_sub_data(dataA, 2, j, k); task->tag_id = TAG12(k,j); if (!no_prio && (j == k+1)) task->priority = STARPU_MAX_PRIO; ret = starpu_task_submit(task); if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); return ret; }
static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j) { /* FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */ struct starpu_task *task = create_task(TAG22(k, i, j)); task->cl = &cl22; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i); task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j); task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j); if (!noprio && (i == k + 1) && (j == k +1) ) { task->priority = STARPU_MAX_PRIO; } /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j)); } else { starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j)); } int n = starpu_matrix_get_nx(task->handles[0]); task->flops = FLOPS_SGEMM(n, n, n); int ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } }
static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel) { int ret; /* FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */ struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel)); task->cl = &cl22; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i); task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j); task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j); if ( (i == k + 1) && (j == k +1) ) { task->priority = STARPU_MAX_PRIO; } /* enforce dependencies ... */ if (k > 0) { starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel)); } else { starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel)); } int n = starpu_matrix_get_nx(task->handles[0]); task->flops = FLOPS_SGEMM(n, n, n); ret = starpu_task_submit(task); if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); return ret; }
static int create_task_11(starpu_data_handle_t dataA, unsigned k) { int ret; struct starpu_task *task = starpu_task_create(); task->cl = &cl11; /* which sub-data is manipulated ? */ task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k); task->tag_id = TAG11(k); /* this is an important task */ if (!no_prio) task->priority = STARPU_MAX_PRIO; ret = starpu_task_submit(task); if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); return ret; }
int main(int argc, char **argv) { int ret; assert(HEIGHT % (2*BLOCK_HEIGHT) == 0); assert(HEIGHT % FACTOR == 0); parse_args(argc, argv); /* fprintf(stderr, "Reading input file ...\n"); */ /* how many frames ? */ struct stat stbuf; stat(filename_in, &stbuf); size_t filesize = stbuf.st_size; unsigned nframes = filesize/FRAMESIZE; /* fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */ assert((filesize % sizeof(struct yuv_frame)) == 0); struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE); assert(yuv_in_buffer); /* fprintf(stderr, "Alloc output file ...\n"); */ struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE); assert(yuv_out_buffer); /* fetch input data */ FILE *f_in = fopen(filename_in, "r"); assert(f_in); /* allocate room for an output buffer */ FILE *f_out = fopen(filename_out, "w+"); assert(f_out); fread(yuv_in_buffer, FRAMESIZE, nframes, f_in); starpu_data_handle_t *frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); ret = starpu_init(NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* register and partition all layers */ unsigned frame; for (frame = 0; frame < nframes; frame++) { /* register Y layer */ starpu_matrix_data_register(&frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].y, WIDTH, WIDTH, HEIGHT, sizeof(uint8_t)); starpu_data_partition(frame_y_handle[frame], &filter_y); starpu_matrix_data_register(&new_frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].y, NEW_WIDTH, NEW_WIDTH, NEW_HEIGHT, sizeof(uint8_t)); starpu_data_partition(new_frame_y_handle[frame], &filter_y); /* register U layer */ starpu_matrix_data_register(&frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].u, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_u_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].u, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_u_handle[frame], &filter_uv); /* register V layer */ starpu_matrix_data_register(&frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].v, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_v_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].v, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_v_handle[frame], &filter_uv); } /* how many tasks are there ? */ unsigned nblocks_y = filter_y.nchildren; unsigned nblocks_uv = filter_uv.nchildren; unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes; fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes); start = starpu_timing_now(); /* do the computation */ for (frame = 0; frame < nframes; frame++) { unsigned blocky; for (blocky = 0; blocky < nblocks_y; blocky++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_y_handle[frame], 1, blocky); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_y_handle[frame], 1, blocky); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blocku; for (blocku = 0; blocku < nblocks_uv; blocku++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_u_handle[frame], 1, blocku); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_u_handle[frame], 1, blocku); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blockv; for (blockv = 0; blockv < nblocks_uv; blockv++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_v_handle[frame], 1, blockv); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_v_handle[frame], 1, blockv); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } } /* make sure all output buffers are sync'ed */ for (frame = 0; frame < nframes; frame++) { starpu_data_unregister(frame_y_handle[frame]); starpu_data_unregister(frame_u_handle[frame]); starpu_data_unregister(frame_v_handle[frame]); starpu_data_unregister(new_frame_y_handle[frame]); starpu_data_unregister(new_frame_u_handle[frame]); starpu_data_unregister(new_frame_v_handle[frame]); } /* There is an implicit barrier: the unregister methods will block * until the computation is done and that the result was put back into * memory. */ end = starpu_timing_now(); double timing = end - start; printf("# s\tFPS\n"); printf("%f\t%f\n", timing/1000000, (1000000*nframes)/timing); fwrite(yuv_out_buffer, NEW_FRAMESIZE, nframes, f_out); /* partition the layers into smaller parts */ starpu_shutdown(); if (fclose(f_in) != 0) fprintf(stderr, "Could not close %s properly\n", filename_in); if (fclose(f_out) != 0) fprintf(stderr, "Could not close %s properly\n", filename_out); return 0; }
int main(int argc, char **argv) { unsigned *foo; starpu_data_handle_t handle; int ret; unsigned n, i, size; ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); #endif n = starpu_worker_get_count(); if (n == 1) { starpu_shutdown(); return STARPU_TEST_SKIPPED; } size = 10 * n; foo = (unsigned *) calloc(size, sizeof(*foo)); for (i = 0; i < size; i++) foo[i] = i; starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)foo, size, sizeof(*foo)); /* Broadcast the data to force in-place partitioning */ for (i = 0; i < n; i++) starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0); struct starpu_data_filter f = { .filter_func = starpu_vector_filter_block, .nchildren = n, }; starpu_data_partition(handle, &f); for (i = 0; i < f.nchildren; i++) { struct starpu_task *task = starpu_task_create(); task->handles[0] = starpu_data_get_sub_data(handle, 1, i); task->cl = &scal_codelet; task->execute_on_a_specific_worker = 1; task->workerid = i; ret = starpu_task_submit(task); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } ret = starpu_task_wait_for_all(); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all"); starpu_data_unpartition(handle, STARPU_MAIN_RAM); starpu_data_unregister(handle); starpu_shutdown(); ret = EXIT_SUCCESS; for (i = 0; i < size; i++) { if (foo[i] != i*2) { FPRINTF(stderr,"value %u is %u instead of %u\n", i, foo[i], 2*i); ret = EXIT_FAILURE; } } return ret; enodev: starpu_data_unregister(handle); fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ starpu_shutdown(); return STARPU_TEST_SKIPPED; }
int main(int argc, char **argv) { int ret, exit_value = 0; /* Initialize StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_load_opencl_from_file("examples/axpy/axpy_opencl_kernel.cl", &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); #endif starpu_cublas_init(); /* This is equivalent to vec_a = malloc(N*sizeof(TYPE)); vec_b = malloc(N*sizeof(TYPE)); */ starpu_malloc((void **)&_vec_x, N*sizeof(TYPE)); assert(_vec_x); starpu_malloc((void **)&_vec_y, N*sizeof(TYPE)); assert(_vec_y); unsigned i; for (i = 0; i < N; i++) { _vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */ _vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */ } FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", _vec_x[0]); FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", _vec_y[0]); /* Declare the data to StarPU */ starpu_vector_data_register(&_handle_x, STARPU_MAIN_RAM, (uintptr_t)_vec_x, N, sizeof(TYPE)); starpu_vector_data_register(&_handle_y, STARPU_MAIN_RAM, (uintptr_t)_vec_y, N, sizeof(TYPE)); /* Divide the vector into blocks */ struct starpu_data_filter block_filter = { .filter_func = starpu_vector_filter_block, .nchildren = NBLOCKS }; starpu_data_partition(_handle_x, &block_filter); starpu_data_partition(_handle_y, &block_filter); double start; double end; start = starpu_timing_now(); unsigned b; for (b = 0; b < NBLOCKS; b++) { struct starpu_task *task = starpu_task_create(); task->cl = &axpy_cl; task->cl_arg = &_alpha; task->cl_arg_size = sizeof(_alpha); task->handles[0] = starpu_data_get_sub_data(_handle_x, 1, b); task->handles[1] = starpu_data_get_sub_data(_handle_y, 1, b); task->tag_id = b; ret = starpu_task_submit(task); if (ret == -ENODEV) { exit_value = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); enodev: starpu_data_unpartition(_handle_x, STARPU_MAIN_RAM); starpu_data_unpartition(_handle_y, STARPU_MAIN_RAM); starpu_data_unregister(_handle_x); starpu_data_unregister(_handle_y); end = starpu_timing_now(); double timing = end - start; FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing); FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[0], _alpha); if (exit_value != 77) exit_value = check(); starpu_free((void *)_vec_x); starpu_free((void *)_vec_y); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); #endif /* Stop StarPU */ starpu_shutdown(); return exit_value; }
starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp, unsigned nblocks STARPU_ATTRIBUTE_UNUSED, unsigned j, unsigned i) { /* we use filters */ return starpu_data_get_sub_data(*dataAp, 2, j, i); }
starpu_data_handle get_block_with_striding(starpu_data_handle *dataAp, unsigned nblocks __attribute__((unused)), unsigned j, unsigned i) { /* we use filters */ return starpu_data_get_sub_data(*dataAp, 2, j, i); }
int main(int argc, char **argv) { double start, end; int ret; parse_args(argc, argv); #ifdef STARPU_QUICK_CHECK niter /= 10; #endif ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); starpu_cublas_init(); init_problem_data(); partition_mult_data(); if (bound) starpu_bound_start(0, 0); start = starpu_timing_now(); unsigned x, y, iter; for (iter = 0; iter < niter; iter++) { for (x = 0; x < nslicesx; x++) for (y = 0; y < nslicesy; y++) { struct starpu_task *task = starpu_task_create(); task->cl = &cl; task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y); task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x); task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y); task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim; ret = starpu_task_submit(task); if (ret == -ENODEV) { ret = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); } end = starpu_timing_now(); if (bound) starpu_bound_stop(); double timing = end - start; double min, min_int; double flops = 2.0*((unsigned long long)niter)*((unsigned long long)xdim) *((unsigned long long)ydim)*((unsigned long long)zdim); if (bound) starpu_bound_compute(&min, &min_int, 1); PRINTF("# x\ty\tz\tms\tGFlops"); if (bound) PRINTF("\tTms\tTGFlops\tTims\tTiGFlops"); PRINTF("\n"); PRINTF("%u\t%u\t%u\t%.0f\t%.1f", xdim, ydim, zdim, timing/niter/1000.0, flops/timing/1000.0); if (bound) PRINTF("\t%.0f\t%.1f\t%.0f\t%.1f", min, flops/min/1000000.0, min_int, flops/min_int/1000000.0); PRINTF("\n"); enodev: starpu_data_unpartition(C_handle, STARPU_MAIN_RAM); starpu_data_unpartition(B_handle, STARPU_MAIN_RAM); starpu_data_unpartition(A_handle, STARPU_MAIN_RAM); starpu_data_unregister(A_handle); starpu_data_unregister(B_handle); starpu_data_unregister(C_handle); if (check) check_output(); starpu_free(A); starpu_free(B); starpu_free(C); starpu_cublas_shutdown(); starpu_shutdown(); return ret; }
static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks) { double start; double end; int ret; /* create all the DAG nodes */ unsigned i,j,k; if (bound) starpu_bound_start(bounddeps, boundprio); start = starpu_timing_now(); for (k = 0; k < nblocks; k++) { ret = create_task_11(dataA, k); if (ret == -ENODEV) return ret; for (i = k+1; i<nblocks; i++) { ret = create_task_12(dataA, k, i); if (ret == -ENODEV) return ret; ret = create_task_21(dataA, k, i); if (ret == -ENODEV) return ret; } starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, k)); for (i = k+1; i<nblocks; i++) for (j = k+1; j<nblocks; j++) { ret = create_task_22(dataA, k, i, j); if (ret == -ENODEV) return ret; } for (i = k+1; i<nblocks; i++) { starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i)); starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k)); } } /* stall the application until the end of computations */ starpu_task_wait_for_all(); end = starpu_timing_now(); if (bound) starpu_bound_stop(); double timing = end - start; unsigned n = starpu_matrix_get_nx(dataA); double flop = (2.0f*n*n*n)/3.0f; PRINTF("# size\tms\tGFlops"); if (bound) PRINTF("\tTms\tTGFlops"); PRINTF("\n"); PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f); if (bound) { double min; starpu_bound_compute(&min, NULL, 0); PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f); } PRINTF("\n"); return 0; }
void launch_spmv_codelets(void) { struct starpu_task *task_tab; uint8_t *is_entry_tab; int ret; /* we call one codelet per block */ unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix); unsigned nrows = starpu_bcsr_get_nrow(sparse_matrix); remainingtasks = NSPMV*nblocks; totaltasks = remainingtasks; unsigned taskid = 0; task_tab = calloc(totaltasks, sizeof(struct starpu_task)); STARPU_ASSERT(task_tab); is_entry_tab = calloc(totaltasks, sizeof(uint8_t)); STARPU_ASSERT(is_entry_tab); printf("there will be %d codelets\n", remainingtasks); uint32_t *rowptr = starpu_bcsr_get_local_rowptr(sparse_matrix); uint32_t *colind = starpu_bcsr_get_local_colind(sparse_matrix); start = starpu_timing_now(); unsigned loop; for (loop = 0; loop < NSPMV; loop++) { unsigned row; unsigned part = 0; for (row = 0; row < nrows; row++) { unsigned index; if (rowptr[row] == rowptr[row+1]) { continue; } for (index = rowptr[row]; index < rowptr[row+1]; index++, part++) { struct starpu_task *task = &task_tab[taskid]; starpu_task_init(task); task->use_tag = 1; task->tag_id = taskid; task->callback_func = init_problem_callback; task->callback_arg = &remainingtasks; task->cl = &cl; task->cl_arg = NULL; unsigned i = colind[index]; unsigned j = row; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = starpu_data_get_sub_data(vector_in, 1, i); task->handles[2] = starpu_data_get_sub_data(vector_out, 1, j); /* all tasks in the same row are dependant so that we don't wait too much for data * we need to wait on the previous task if we are not the first task of a row */ if (index != rowptr[row & ~0x3]) { /* this is not the first task in the row */ starpu_tag_declare_deps((starpu_tag_t)taskid, 1, (starpu_tag_t)(taskid-1)); is_entry_tab[taskid] = 0; } else { /* this is an entry task */ is_entry_tab[taskid] = 1; } taskid++; } } } printf("start submitting tasks !\n"); /* submit ALL tasks now */ unsigned nchains = 0; unsigned task; for (task = 0; task < totaltasks; task++) { if (is_entry_tab[task]) { nchains++; } ret = starpu_task_submit(&task_tab[task]); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } printf("end of task submission (there was %d chains for %d tasks : ratio %d tasks per chain) !\n", nchains, totaltasks, totaltasks/nchains); }
int main(int argc, char **argv) { int ret; unsigned part; double timing; double start, end; unsigned row, pos; unsigned ind; /* CSR matrix description */ float *nzval; uint32_t nnz; uint32_t *colind; uint32_t *rowptr; /* Input and Output vectors */ float *vector_in_ptr; float *vector_out_ptr; /* * Parse command-line arguments */ parse_args(argc, argv); /* * Launch StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* * Create a 3-band sparse matrix as input example */ nnz = 3*size-2; starpu_malloc((void **)&nzval, nnz*sizeof(float)); starpu_malloc((void **)&colind, nnz*sizeof(uint32_t)); starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t)); assert(nzval && colind && rowptr); /* fill the matrix */ for (row = 0, pos = 0; row < size; row++) { rowptr[row] = pos; if (row > 0) { nzval[pos] = 1.0f; colind[pos] = row-1; pos++; } nzval[pos] = 5.0f; colind[pos] = row; pos++; if (row < size - 1) { nzval[pos] = 1.0f; colind[pos] = row+1; pos++; } } STARPU_ASSERT(pos == nnz); rowptr[size] = nnz; /* initiate the 2 vectors */ starpu_malloc((void **)&vector_in_ptr, size*sizeof(float)); starpu_malloc((void **)&vector_out_ptr, size*sizeof(float)); assert(vector_in_ptr && vector_out_ptr); /* fill them */ for (ind = 0; ind < size; ind++) { vector_in_ptr[ind] = 2.0f; vector_out_ptr[ind] = 0.0f; } /* * Register the CSR matrix and the 2 vectors */ starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float)); starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float)); starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float)); /* * Partition the CSR matrix and the output vector */ csr_f.nchildren = nblocks; vector_f.nchildren = nblocks; starpu_data_partition(sparse_matrix, &csr_f); starpu_data_partition(vector_out, &vector_f); /* * If we use OpenCL, we need to compile the SpMV kernel */ #ifdef STARPU_USE_OPENCL compile_spmv_opencl_kernel(); #endif start = starpu_timing_now(); /* * Create and submit StarPU tasks */ for (part = 0; part < nblocks; part++) { struct starpu_task *task = starpu_task_create(); task->cl = &spmv_cl; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = vector_in; task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part); ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } } starpu_task_wait_for_all(); end = starpu_timing_now(); /* * Unregister the CSR matrix and the output vector */ starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); starpu_data_unpartition(vector_out, STARPU_MAIN_RAM); /* * Unregister data */ starpu_data_unregister(sparse_matrix); starpu_data_unregister(vector_in); starpu_data_unregister(vector_out); /* * Display the result */ for (row = 0; row < STARPU_MIN(size, 16); row++) { FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]); } starpu_free(nzval); starpu_free(colind); starpu_free(rowptr); starpu_free(vector_in_ptr); starpu_free(vector_out_ptr); /* * Stop StarPU */ starpu_shutdown(); timing = end - start; FPRINTF(stderr, "Computation took (in ms)\n"); FPRINTF(stdout, "%2.2f\n", timing/1000); return 0; }