int main(int argc, char **argv) { unsigned i; double timing; double start; double end; int ret; parse_args(argc, argv); #ifdef STARPU_HAVE_VALGRIND_H if(RUNNING_ON_VALGRIND) ntasks = 5; #endif ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); struct starpu_task task; starpu_task_init(&task); task.cl = &dummy_codelet; task.detach = 0; FPRINTF(stderr, "#tasks : %u\n", ntasks); start = starpu_timing_now(); for (i = 0; i < ntasks; i++) { ret = starpu_task_submit(&task); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); ret = starpu_task_wait(&task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait"); } end = starpu_timing_now(); timing = end - start; FPRINTF(stderr, "Total: %f secs\n", timing/1000000); FPRINTF(stderr, "Per task: %f usecs\n", timing/ntasks); starpu_task_clean(&task); starpu_shutdown(); return EXIT_SUCCESS; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ starpu_shutdown(); return STARPU_TEST_SKIPPED; }
int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned) { double start; double end; int ret; start = starpu_timing_now(); ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0); end = starpu_timing_now(); double timing = end - start; double flop = (1.0f*size*size*size)/3.0f; PRINTF("# size\tms\tGFlops\n"); PRINTF("%u\t%.0f\t%.1f\n", size, timing/1000, (flop/timing/1000.0f)); return ret; }
void init_problem_callback(void *arg) { unsigned *remaining = arg; unsigned val = STARPU_ATOMIC_ADD(remaining, -1); /* if (val < 10) printf("callback %d remaining \n", val); */ if ( val == 0 ) { printf("DONE ...\n"); end = starpu_timing_now(); sem_post(&sem); } }
int starpu_mct_compute_expected_times(struct starpu_sched_component *component, struct starpu_task *task, double *estimated_lengths, double *estimated_transfer_length, double *estimated_ends_with_task, double *min_exp_end_with_task, double *max_exp_end_with_task, int *suitable_components) { int nsuitable_components = 0; int i; for(i = 0; i < component->nchildren; i++) { struct starpu_sched_component * c = component->children[i]; if(starpu_sched_component_execute_preds(c, task, estimated_lengths + i)) { if(isnan(estimated_lengths[i])) /* The perfmodel had been purged since the task was pushed * onto the mct component. */ continue; /* Estimated availability of worker */ double estimated_end = c->estimated_end(c); double now = starpu_timing_now(); if (estimated_end < now) estimated_end = now; estimated_transfer_length[i] = starpu_sched_component_transfer_length(c, task); estimated_ends_with_task[i] = compute_expected_time(now, estimated_end, estimated_lengths[i], estimated_transfer_length[i]); if(estimated_ends_with_task[i] < *min_exp_end_with_task) *min_exp_end_with_task = estimated_ends_with_task[i]; if(estimated_ends_with_task[i] > *max_exp_end_with_task) *max_exp_end_with_task = estimated_ends_with_task[i]; suitable_components[nsuitable_components++] = i; } } return nsuitable_components; }
int main(int argc, char **argv) { int ret; unsigned i; double timing; double start; double end; #ifdef STARPU_QUICK_CHECK ntasks = 128; #endif parse_args(argc, argv); ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); fprintf(stderr, "#tasks : %u\n", ntasks); start = starpu_timing_now(); for (i = 0; i < ntasks; i++) { ret = inject_one_task(); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } end = starpu_timing_now(); timing = end - start; fprintf(stderr, "Total: %f secs\n", timing/1000000); fprintf(stderr, "Per task: %f usecs\n", timing/ntasks); { char *output_dir = getenv("STARPU_BENCH_DIR"); char *bench_id = getenv("STARPU_BENCH_ID"); if (output_dir && bench_id) { char file[1024]; FILE *f; sprintf(file, "%s/sync_tasks_overhead_total.dat", output_dir); f = fopen(file, "a"); fprintf(f, "%s\t%f\n", bench_id, timing/1000000); fclose(f); sprintf(file, "%s/sync_tasks_overhead_per_task.dat", output_dir); f = fopen(file, "a"); fprintf(f, "%s\t%f\n", bench_id, timing/ntasks); fclose(f); } } starpu_shutdown(); return EXIT_SUCCESS; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ starpu_shutdown(); return STARPU_TEST_SKIPPED; }
void launch_spmv_codelets(void) { struct starpu_task *task_tab; uint8_t *is_entry_tab; int ret; /* we call one codelet per block */ unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix); unsigned nrows = starpu_bcsr_get_nrow(sparse_matrix); remainingtasks = NSPMV*nblocks; totaltasks = remainingtasks; unsigned taskid = 0; task_tab = calloc(totaltasks, sizeof(struct starpu_task)); STARPU_ASSERT(task_tab); is_entry_tab = calloc(totaltasks, sizeof(uint8_t)); STARPU_ASSERT(is_entry_tab); printf("there will be %d codelets\n", remainingtasks); uint32_t *rowptr = starpu_bcsr_get_local_rowptr(sparse_matrix); uint32_t *colind = starpu_bcsr_get_local_colind(sparse_matrix); start = starpu_timing_now(); unsigned loop; for (loop = 0; loop < NSPMV; loop++) { unsigned row; unsigned part = 0; for (row = 0; row < nrows; row++) { unsigned index; if (rowptr[row] == rowptr[row+1]) { continue; } for (index = rowptr[row]; index < rowptr[row+1]; index++, part++) { struct starpu_task *task = &task_tab[taskid]; starpu_task_init(task); task->use_tag = 1; task->tag_id = taskid; task->callback_func = init_problem_callback; task->callback_arg = &remainingtasks; task->cl = &cl; task->cl_arg = NULL; unsigned i = colind[index]; unsigned j = row; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = starpu_data_get_sub_data(vector_in, 1, i); task->handles[2] = starpu_data_get_sub_data(vector_out, 1, j); /* all tasks in the same row are dependant so that we don't wait too much for data * we need to wait on the previous task if we are not the first task of a row */ if (index != rowptr[row & ~0x3]) { /* this is not the first task in the row */ starpu_tag_declare_deps((starpu_tag_t)taskid, 1, (starpu_tag_t)(taskid-1)); is_entry_tab[taskid] = 0; } else { /* this is an entry task */ is_entry_tab[taskid] = 1; } taskid++; } } } printf("start submitting tasks !\n"); /* submit ALL tasks now */ unsigned nchains = 0; unsigned task; for (task = 0; task < totaltasks; task++) { if (is_entry_tab[task]) { nchains++; } ret = starpu_task_submit(&task_tab[task]); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } printf("end of task submission (there was %d chains for %d tasks : ratio %d tasks per chain) !\n", nchains, totaltasks, totaltasks/nchains); }
static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks) { double start; double end; struct starpu_task *entry_task = NULL; /* create all the DAG nodes */ unsigned i,j,k; start = starpu_timing_now(); for (k = 0; k < nblocks; k++) { struct starpu_task *task = create_task_11(dataA, k); /* we defer the launch of the first task */ if (k == 0) { entry_task = task; } else { int ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } } for (j = k+1; j<nblocks; j++) { create_task_21(dataA, k, j); for (i = k+1; i<nblocks; i++) { if (i <= j) create_task_22(dataA, k, i, j); } } } /* schedule the codelet */ int ret = starpu_task_submit(entry_task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } /* stall the application until the end of computations */ starpu_tag_wait(TAG11(nblocks-1)); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); end = starpu_timing_now(); double timing = end - start; unsigned n = starpu_matrix_get_nx(dataA); double flop = (1.0f*n*n*n)/3.0f; PRINTF("# size\tms\tGFlops\n"); PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f)); }
static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp, struct piv_s *piv_description, unsigned nblocks, starpu_data_handle_t (* get_block)(starpu_data_handle_t *, unsigned, unsigned, unsigned), double *timing) { double start; double end; int ret; /* create all the DAG nodes */ unsigned i,j,k; if (bound) starpu_bound_start(bounddeps, boundprio); start = starpu_timing_now(); for (k = 0; k < nblocks; k++) { ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block); if (ret == -ENODEV) return ret; for (i = 0; i < nblocks; i++) { if (i != k) { ret = create_task_pivot(dataAp, nblocks, piv_description, k, i, get_block); if (ret == -ENODEV) return ret; } } for (i = k+1; i<nblocks; i++) { ret = create_task_12(dataAp, nblocks, k, i, get_block); if (ret == -ENODEV) return ret; ret = create_task_21(dataAp, nblocks, k, i, get_block); if (ret == -ENODEV) return ret; } starpu_data_wont_use(get_block(dataAp, nblocks, k, k)); for (i = k+1; i<nblocks; i++) for (j = k+1; j<nblocks; j++) { ret = create_task_22(dataAp, nblocks, k, i, j, get_block); if (ret == -ENODEV) return ret; } for (i = k+1; i<nblocks; i++) { starpu_data_wont_use(get_block(dataAp, nblocks, k, i)); starpu_data_wont_use(get_block(dataAp, nblocks, i, k)); } } /* stall the application until the end of computations */ starpu_task_wait_for_all(); end = starpu_timing_now(); if (bound) starpu_bound_stop(); *timing = end - start; return 0; }
static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks) { double start; double end; int ret; /* create all the DAG nodes */ unsigned i,j,k; if (bound) starpu_bound_start(bounddeps, boundprio); start = starpu_timing_now(); for (k = 0; k < nblocks; k++) { ret = create_task_11(dataA, k); if (ret == -ENODEV) return ret; for (i = k+1; i<nblocks; i++) { ret = create_task_12(dataA, k, i); if (ret == -ENODEV) return ret; ret = create_task_21(dataA, k, i); if (ret == -ENODEV) return ret; } starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, k)); for (i = k+1; i<nblocks; i++) for (j = k+1; j<nblocks; j++) { ret = create_task_22(dataA, k, i, j); if (ret == -ENODEV) return ret; } for (i = k+1; i<nblocks; i++) { starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i)); starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k)); } } /* stall the application until the end of computations */ starpu_task_wait_for_all(); end = starpu_timing_now(); if (bound) starpu_bound_stop(); double timing = end - start; unsigned n = starpu_matrix_get_nx(dataA); double flop = (2.0f*n*n*n)/3.0f; PRINTF("# size\tms\tGFlops"); if (bound) PRINTF("\tTms\tTGFlops"); PRINTF("\n"); PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f); if (bound) { double min; starpu_bound_compute(&min, NULL, 0); PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f); } PRINTF("\n"); return 0; }
int main(int argc, char **argv) { int ret, exit_value = 0; /* Initialize StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_load_opencl_from_file("examples/axpy/axpy_opencl_kernel.cl", &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); #endif starpu_cublas_init(); /* This is equivalent to vec_a = malloc(N*sizeof(TYPE)); vec_b = malloc(N*sizeof(TYPE)); */ starpu_malloc((void **)&_vec_x, N*sizeof(TYPE)); assert(_vec_x); starpu_malloc((void **)&_vec_y, N*sizeof(TYPE)); assert(_vec_y); unsigned i; for (i = 0; i < N; i++) { _vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */ _vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */ } FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", _vec_x[0]); FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", _vec_y[0]); /* Declare the data to StarPU */ starpu_vector_data_register(&_handle_x, STARPU_MAIN_RAM, (uintptr_t)_vec_x, N, sizeof(TYPE)); starpu_vector_data_register(&_handle_y, STARPU_MAIN_RAM, (uintptr_t)_vec_y, N, sizeof(TYPE)); /* Divide the vector into blocks */ struct starpu_data_filter block_filter = { .filter_func = starpu_vector_filter_block, .nchildren = NBLOCKS }; starpu_data_partition(_handle_x, &block_filter); starpu_data_partition(_handle_y, &block_filter); double start; double end; start = starpu_timing_now(); unsigned b; for (b = 0; b < NBLOCKS; b++) { struct starpu_task *task = starpu_task_create(); task->cl = &axpy_cl; task->cl_arg = &_alpha; task->cl_arg_size = sizeof(_alpha); task->handles[0] = starpu_data_get_sub_data(_handle_x, 1, b); task->handles[1] = starpu_data_get_sub_data(_handle_y, 1, b); task->tag_id = b; ret = starpu_task_submit(task); if (ret == -ENODEV) { exit_value = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); enodev: starpu_data_unpartition(_handle_x, STARPU_MAIN_RAM); starpu_data_unpartition(_handle_y, STARPU_MAIN_RAM); starpu_data_unregister(_handle_x); starpu_data_unregister(_handle_y); end = starpu_timing_now(); double timing = end - start; FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing); FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[0], _alpha); if (exit_value != 77) exit_value = check(); starpu_free((void *)_vec_x); starpu_free((void *)_vec_y); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); #endif /* Stop StarPU */ starpu_shutdown(); return exit_value; }
int main(int argc, char **argv) { int ret; assert(HEIGHT % (2*BLOCK_HEIGHT) == 0); assert(HEIGHT % FACTOR == 0); parse_args(argc, argv); /* fprintf(stderr, "Reading input file ...\n"); */ /* how many frames ? */ struct stat stbuf; stat(filename_in, &stbuf); size_t filesize = stbuf.st_size; unsigned nframes = filesize/FRAMESIZE; /* fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */ assert((filesize % sizeof(struct yuv_frame)) == 0); struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE); assert(yuv_in_buffer); /* fprintf(stderr, "Alloc output file ...\n"); */ struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE); assert(yuv_out_buffer); /* fetch input data */ FILE *f_in = fopen(filename_in, "r"); assert(f_in); /* allocate room for an output buffer */ FILE *f_out = fopen(filename_out, "w+"); assert(f_out); fread(yuv_in_buffer, FRAMESIZE, nframes, f_in); starpu_data_handle_t *frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); ret = starpu_init(NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* register and partition all layers */ unsigned frame; for (frame = 0; frame < nframes; frame++) { /* register Y layer */ starpu_matrix_data_register(&frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].y, WIDTH, WIDTH, HEIGHT, sizeof(uint8_t)); starpu_data_partition(frame_y_handle[frame], &filter_y); starpu_matrix_data_register(&new_frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].y, NEW_WIDTH, NEW_WIDTH, NEW_HEIGHT, sizeof(uint8_t)); starpu_data_partition(new_frame_y_handle[frame], &filter_y); /* register U layer */ starpu_matrix_data_register(&frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].u, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_u_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].u, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_u_handle[frame], &filter_uv); /* register V layer */ starpu_matrix_data_register(&frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].v, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_v_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].v, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_v_handle[frame], &filter_uv); } /* how many tasks are there ? */ unsigned nblocks_y = filter_y.nchildren; unsigned nblocks_uv = filter_uv.nchildren; unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes; fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes); start = starpu_timing_now(); /* do the computation */ for (frame = 0; frame < nframes; frame++) { unsigned blocky; for (blocky = 0; blocky < nblocks_y; blocky++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_y_handle[frame], 1, blocky); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_y_handle[frame], 1, blocky); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blocku; for (blocku = 0; blocku < nblocks_uv; blocku++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_u_handle[frame], 1, blocku); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_u_handle[frame], 1, blocku); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blockv; for (blockv = 0; blockv < nblocks_uv; blockv++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_v_handle[frame], 1, blockv); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_v_handle[frame], 1, blockv); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } } /* make sure all output buffers are sync'ed */ for (frame = 0; frame < nframes; frame++) { starpu_data_unregister(frame_y_handle[frame]); starpu_data_unregister(frame_u_handle[frame]); starpu_data_unregister(frame_v_handle[frame]); starpu_data_unregister(new_frame_y_handle[frame]); starpu_data_unregister(new_frame_u_handle[frame]); starpu_data_unregister(new_frame_v_handle[frame]); } /* There is an implicit barrier: the unregister methods will block * until the computation is done and that the result was put back into * memory. */ end = starpu_timing_now(); double timing = end - start; printf("# s\tFPS\n"); printf("%f\t%f\n", timing/1000000, (1000000*nframes)/timing); fwrite(yuv_out_buffer, NEW_FRAMESIZE, nframes, f_out); /* partition the layers into smaller parts */ starpu_shutdown(); if (fclose(f_in) != 0) fprintf(stderr, "Could not close %s properly\n", filename_in); if (fclose(f_out) != 0) fprintf(stderr, "Could not close %s properly\n", filename_out); return 0; }
int main(int argc, char **argv) { double start, end; int ret; parse_args(argc, argv); #ifdef STARPU_QUICK_CHECK niter /= 10; #endif ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); starpu_cublas_init(); init_problem_data(); partition_mult_data(); if (bound) starpu_bound_start(0, 0); start = starpu_timing_now(); unsigned x, y, iter; for (iter = 0; iter < niter; iter++) { for (x = 0; x < nslicesx; x++) for (y = 0; y < nslicesy; y++) { struct starpu_task *task = starpu_task_create(); task->cl = &cl; task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y); task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x); task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y); task->flops = 2ULL * (xdim/nslicesx) * (ydim/nslicesy) * zdim; ret = starpu_task_submit(task); if (ret == -ENODEV) { ret = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); } end = starpu_timing_now(); if (bound) starpu_bound_stop(); double timing = end - start; double min, min_int; double flops = 2.0*((unsigned long long)niter)*((unsigned long long)xdim) *((unsigned long long)ydim)*((unsigned long long)zdim); if (bound) starpu_bound_compute(&min, &min_int, 1); PRINTF("# x\ty\tz\tms\tGFlops"); if (bound) PRINTF("\tTms\tTGFlops\tTims\tTiGFlops"); PRINTF("\n"); PRINTF("%u\t%u\t%u\t%.0f\t%.1f", xdim, ydim, zdim, timing/niter/1000.0, flops/timing/1000.0); if (bound) PRINTF("\t%.0f\t%.1f\t%.0f\t%.1f", min, flops/min/1000000.0, min_int, flops/min_int/1000000.0); PRINTF("\n"); enodev: starpu_data_unpartition(C_handle, STARPU_MAIN_RAM); starpu_data_unpartition(B_handle, STARPU_MAIN_RAM); starpu_data_unpartition(A_handle, STARPU_MAIN_RAM); starpu_data_unregister(A_handle); starpu_data_unregister(B_handle); starpu_data_unregister(C_handle); if (check) check_output(); starpu_free(A); starpu_free(B); starpu_free(C); starpu_cublas_shutdown(); starpu_shutdown(); return ret; }
int main(int argc, char **argv) { int ret; unsigned part; double timing; double start, end; unsigned row, pos; unsigned ind; /* CSR matrix description */ float *nzval; uint32_t nnz; uint32_t *colind; uint32_t *rowptr; /* Input and Output vectors */ float *vector_in_ptr; float *vector_out_ptr; /* * Parse command-line arguments */ parse_args(argc, argv); /* * Launch StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* * Create a 3-band sparse matrix as input example */ nnz = 3*size-2; starpu_malloc((void **)&nzval, nnz*sizeof(float)); starpu_malloc((void **)&colind, nnz*sizeof(uint32_t)); starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t)); assert(nzval && colind && rowptr); /* fill the matrix */ for (row = 0, pos = 0; row < size; row++) { rowptr[row] = pos; if (row > 0) { nzval[pos] = 1.0f; colind[pos] = row-1; pos++; } nzval[pos] = 5.0f; colind[pos] = row; pos++; if (row < size - 1) { nzval[pos] = 1.0f; colind[pos] = row+1; pos++; } } STARPU_ASSERT(pos == nnz); rowptr[size] = nnz; /* initiate the 2 vectors */ starpu_malloc((void **)&vector_in_ptr, size*sizeof(float)); starpu_malloc((void **)&vector_out_ptr, size*sizeof(float)); assert(vector_in_ptr && vector_out_ptr); /* fill them */ for (ind = 0; ind < size; ind++) { vector_in_ptr[ind] = 2.0f; vector_out_ptr[ind] = 0.0f; } /* * Register the CSR matrix and the 2 vectors */ starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float)); starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float)); starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float)); /* * Partition the CSR matrix and the output vector */ csr_f.nchildren = nblocks; vector_f.nchildren = nblocks; starpu_data_partition(sparse_matrix, &csr_f); starpu_data_partition(vector_out, &vector_f); /* * If we use OpenCL, we need to compile the SpMV kernel */ #ifdef STARPU_USE_OPENCL compile_spmv_opencl_kernel(); #endif start = starpu_timing_now(); /* * Create and submit StarPU tasks */ for (part = 0; part < nblocks; part++) { struct starpu_task *task = starpu_task_create(); task->cl = &spmv_cl; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = vector_in; task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part); ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } } starpu_task_wait_for_all(); end = starpu_timing_now(); /* * Unregister the CSR matrix and the output vector */ starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); starpu_data_unpartition(vector_out, STARPU_MAIN_RAM); /* * Unregister data */ starpu_data_unregister(sparse_matrix); starpu_data_unregister(vector_in); starpu_data_unregister(vector_out); /* * Display the result */ for (row = 0; row < STARPU_MIN(size, 16); row++) { FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]); } starpu_free(nzval); starpu_free(colind); starpu_free(rowptr); starpu_free(vector_in_ptr); starpu_free(vector_out_ptr); /* * Stop StarPU */ starpu_shutdown(); timing = end - start; FPRINTF(stderr, "Computation took (in ms)\n"); FPRINTF(stdout, "%2.2f\n", timing/1000); return 0; }
struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker) { struct starpu_task *task; int worker_id; unsigned node; /* We can't tell in advance which task will be picked up, so we measure * a timestamp, and will attribute it afterwards to the task. */ int profiling = starpu_profiling_status_get(); struct timespec pop_start_time; if (profiling) _starpu_clock_gettime(&pop_start_time); pick: /* perhaps there is some local task to be executed first */ task = _starpu_pop_local_task(worker); /* get tasks from the stacks of the strategy */ if(!task) { struct _starpu_sched_ctx *sched_ctx ; #ifndef STARPU_NON_BLOCKING_DRIVERS int been_here[STARPU_NMAX_SCHED_CTXS]; int i; for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++) been_here[i] = 0; while(!task) #endif { if(worker->nsched_ctxs == 1) sched_ctx = _starpu_get_initial_sched_ctx(); else { while(1) { sched_ctx = _get_next_sched_ctx_to_pop_into(worker); if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1) { _starpu_worker_gets_out_of_ctx(sched_ctx->id, worker); worker->removed_from_ctx[sched_ctx->id] = 0; sched_ctx = NULL; } else break; } } if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS) { if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task) { task = sched_ctx->sched_policy->pop_task(sched_ctx->id); _starpu_pop_task_end(task); } } if(!task) { /* it doesn't matter if it shares tasks list or not in the scheduler, if it does not have any task to pop just get it out of here */ /* however if it shares a task list it will be removed as soon as he finishes this job (in handle_job_termination) */ if(worker->removed_from_ctx[sched_ctx->id]) { _starpu_worker_gets_out_of_ctx(sched_ctx->id, worker); worker->removed_from_ctx[sched_ctx->id] = 0; } #ifdef STARPU_USE_SC_HYPERVISOR if(worker->pop_ctx_priority) { struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters; if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id)) { // _STARPU_TRACE_HYPERVISOR_BEGIN(); perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0); // _STARPU_TRACE_HYPERVISOR_END(); } } #endif //STARPU_USE_SC_HYPERVISOR #ifndef STARPU_NON_BLOCKING_DRIVERS if(been_here[sched_ctx->id] || worker->nsched_ctxs == 1) break; been_here[sched_ctx->id] = 1; #endif } } } if (!task) { idle_start[worker->workerid] = starpu_timing_now(); return NULL; } if(idle_start[worker->workerid] != 0.0) { double idle_end = starpu_timing_now(); idle[worker->workerid] += (idle_end - idle_start[worker->workerid]); idle_start[worker->workerid] = 0.0; } #ifdef STARPU_USE_SC_HYPERVISOR struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx); struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters; if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id)) { // _STARPU_TRACE_HYPERVISOR_BEGIN(); perf_counters->notify_poped_task(task->sched_ctx, worker->workerid); // _STARPU_TRACE_HYPERVISOR_END(); } #endif //STARPU_USE_SC_HYPERVISOR /* Make sure we do not bother with all the multiformat-specific code if * it is not necessary. */ if (!_starpu_task_uses_multiformat_handles(task)) goto profiling; /* This is either a conversion task, or a regular task for which the * conversion tasks have already been created and submitted */ if (task->mf_skip) goto profiling; /* * This worker may not be able to execute this task. In this case, we * should return the task anyway. It will be pushed back almost immediatly. * This way, we avoid computing and executing the conversions tasks. * Here, we do not care about what implementation is used. */ worker_id = starpu_worker_get_id(); if (!starpu_worker_can_execute_task_first_impl(worker_id, task, NULL)) return task; node = starpu_worker_get_memory_node(worker_id); /* * We do have a task that uses multiformat handles. Let's create the * required conversion tasks. */ STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex); unsigned i; unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task); for (i = 0; i < nbuffers; i++) { struct starpu_task *conversion_task; starpu_data_handle_t handle; handle = STARPU_TASK_GET_HANDLE(task, i); if (!_starpu_handle_needs_conversion_task(handle, node)) continue; conversion_task = _starpu_create_conversion_task(handle, node); conversion_task->mf_skip = 1; conversion_task->execute_on_a_specific_worker = 1; conversion_task->workerid = worker_id; /* * Next tasks will need to know where these handles have gone. */ handle->mf_node = node; _starpu_task_submit_conversion_task(conversion_task, worker_id); } task->mf_skip = 1; starpu_task_list_push_back(&worker->local_tasks, task); STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex); goto pick; profiling: if (profiling) { struct starpu_profiling_task_info *profiling_info; profiling_info = task->profiling_info; /* The task may have been created before profiling was enabled, * so we check if the profiling_info structure is available * even though we already tested if profiling is enabled. */ if (profiling_info) { memcpy(&profiling_info->pop_start_time, &pop_start_time, sizeof(struct timespec)); _starpu_clock_gettime(&profiling_info->pop_end_time); } } if(task->prologue_callback_pop_func) task->prologue_callback_pop_func(task->prologue_callback_pop_arg); return task; }