static void partition_mult_data(void) { gettimeofday(&start, NULL); starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, ydim, ydim, zdim, sizeof(float)); starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, zdim, zdim, xdim, sizeof(float)); starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, ydim, ydim, xdim, sizeof(float)); starpu_data_set_wt_mask(C_handle, 1<<0); conf.k = zdim; conf.m = ydim/nslicesy; conf.n = xdim/nslicesx; struct starpu_data_filter f; f.filter_func = starpu_vertical_block_filter_func; f.nchildren = nslicesx; f.get_nchildren = NULL; f.get_child_ops = NULL; struct starpu_data_filter f2; f2.filter_func = starpu_block_filter_func; f2.nchildren = nslicesy; f2.get_nchildren = NULL; f2.get_child_ops = NULL; starpu_data_partition(B_handle, &f); starpu_data_partition(A_handle, &f2); starpu_data_map_filters(C_handle, 2, &f, &f2); }
static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks) { starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); _cholesky(dataA, nblocks); starpu_data_unregister(dataA); }
/*
 * LU factorisation entry point without pivoting.
 *
 * Registers matA (size x size elements of TYPE, leading dimension ld) in main
 * memory, splits it with two nblocks-way filters, runs dw_codelet_facto_v3()
 * on the partitioned handle, then gathers the pieces and unregisters.
 *
 * Returns the value produced by dw_codelet_facto_v3().
 */
int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle_t dataA;
	int status;

	/* first-level and second-level filters, each producing nblocks children */
	struct starpu_data_filter level1 =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};
	struct starpu_data_filter level2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	/* hand the matrix over to StarPU, then cut it so that a block is
	 * identified by a pair of indices */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
	starpu_data_map_filters(dataA, 2, &level1, &level2);

	status = dw_codelet_facto_v3(dataA, nblocks);

	/* bring every block back to main memory and release the handle */
	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
	starpu_data_unregister(dataA);

	return status;
}
/*
 * LU factorisation with pivoting on a matrix stored as separate blocks.
 *
 * matA    : array of nblocks*nblocks block pointers, indexed bi + nblocks*bj
 * ipiv    : pivot array of `size` entries, reset to the identity here
 * size    : global matrix order; each block is (size/nblocks) square
 * ld      : not used by this variant
 * nblocks : number of blocks per dimension
 *
 * Returns the status of dw_codelet_facto_pivot(); on success the timing and
 * GFlops figures are printed through PRINTF.
 */
int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	/* do the size computation in size_t so nblocks*nblocks cannot wrap in unsigned */
	starpu_data_handle_t *dataAp = malloc((size_t)nblocks*nblocks*sizeof(*dataAp));
	STARPU_ASSERT(dataAp);

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], STARPU_MAIN_RAM,
			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
			size/nblocks, size/nblocks, sizeof(TYPE));
	}

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* one descriptor per block row: shared pivot array plus [first, last) range */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	STARPU_ASSERT(piv_description);

	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
	/* NOTE(review): the early return keeps its original behaviour (nothing is
	 * released) because it is not visible here whether tasks that reference
	 * these buffers may still be pending after a failure - confirm. */
	if (ret)
		return ret;

	unsigned n = starpu_matrix_get_nx(dataAp[0])*nblocks;
	double flop = (2.0f*n*n*n)/3.0f;

	PRINTF("# size\tms\tGFlops");
	if (bound)
		PRINTF("\tTms\tTGFlops");
	PRINTF("\n");
	PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f);
	if (bound)
	{
		double min;
		starpu_bound_compute(&min, NULL, 0);
		PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f);
	}
	PRINTF("\n");

	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_data_unregister(dataAp[bi+nblocks*bj]);
	}

	/* every handle is unregistered: the host-side bookkeeping can go */
	free(dataAp);
	free(piv_description);

	return ret;
}
/*
 * LU factorisation with pivoting on a contiguous (strided) matrix.
 *
 * matA    : matrix buffer, size x size elements of TYPE, leading dimension ld
 * ipiv    : pivot array of `size` entries, reset to the identity here
 * nblocks : argument given to both filters; also the number of pivot
 *           descriptors built
 *
 * Prints the elapsed time and a GFlops figure on stderr, then gathers the
 * partitioned data back on node 0.
 *
 * NOTE(review): the handle is unpartitioned but never unregistered in this
 * function - confirm whether that is intended.
 */
void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	/* We already enforce deps by hand */
	starpu_data_set_sequential_consistency_flag(dataA, 0);

	/* designated initialisers zero every member that is not listed */
	struct starpu_data_filter f = {
		.filter_func = starpu_vertical_block_filter_func,
		.filter_arg = nblocks
	};

	struct starpu_data_filter f2 = {
		.filter_func = starpu_block_filter_func,
		.filter_arg = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* one descriptor per block row: shared pivot array plus [first, last) range */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataA);
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	/* gather all the data */
	starpu_data_unpartition(dataA, 0);

	/* the factorisation has returned and the data is gathered: the pivot
	 * descriptors are no longer needed */
	free(piv_description);
}
static void partition_mult_data(void) { starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, ydim, ydim, zdim, sizeof(TYPE)); starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, zdim, zdim, xdim, sizeof(TYPE)); starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, ydim, ydim, xdim, sizeof(TYPE)); struct starpu_data_filter vert; memset(&vert, 0, sizeof(vert)); vert.filter_func = starpu_matrix_filter_vertical_block; vert.nchildren = nslicesx; struct starpu_data_filter horiz; memset(&horiz, 0, sizeof(horiz)); horiz.filter_func = starpu_matrix_filter_block; horiz.nchildren = nslicesy; starpu_data_partition(B_handle, &vert); starpu_data_partition(A_handle, &horiz); starpu_data_map_filters(C_handle, 2, &vert, &horiz); }
/*
 * LU factorisation with pivoting on a matrix stored as separate blocks.
 *
 * matA    : array of nblocks*nblocks block pointers, indexed bi + nblocks*bj
 * ipiv    : pivot array of `size` entries, reset to the identity here
 * size    : global matrix order; each block is (size/nblocks) square
 * ld      : not used by this variant
 * nblocks : number of blocks per dimension
 *
 * Prints the elapsed time and a GFlops figure on stderr.
 */
void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle));

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], 0,
			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
			size/nblocks, size/nblocks, sizeof(TYPE));

		/* We already enforce deps by hand */
		starpu_data_set_sequential_consistency_flag(dataAp[bi+nblocks*bj], 0);
	}

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* one descriptor per block row: shared pivot array plus [first, last) range */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	timing = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataAp[0])*nblocks;
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_data_unregister(dataAp[bi+nblocks*bj]);
	}

	/* every handle is unregistered: release the host-side bookkeeping that
	 * was previously leaked */
	free(piv_description);
	free(dataAp);
}
void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio) { #ifdef CHECK_RESULTS FPRINTF(stderr, "Checking results ...\n"); float *Asaved; Asaved = malloc((size_t)ld*ld*sizeof(float)); memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float)); #endif no_prio = _no_prio; starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); dw_codelet_facto_v3(dataA, nblocks); /* gather all the data */ starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); #ifdef CHECK_RESULTS compare_A_LU(Asaved, matA, size, ld); #endif }
/*
 * LU factorisation with pivoting on a contiguous (strided) matrix.
 *
 * matA    : matrix buffer, size x size elements of TYPE, leading dimension ld
 * ipiv    : pivot array of `size` entries, reset to the identity here
 * nblocks : number of children requested from each filter; also the number
 *           of pivot descriptors built
 *
 * Returns the status of dw_codelet_facto_pivot(); on success the timing and
 * GFlops figures are printed through PRINTF and all resources are released.
 */
int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle_t dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* one descriptor per block row: shared pivot array plus [first, last) range */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	/* the loop below writes through this pointer */
	STARPU_ASSERT(piv_description);

	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
	/* NOTE(review): the early return keeps its original behaviour (handle
	 * left partitioned, descriptors not freed) because it is not visible
	 * here whether tasks may still be pending after a failure - confirm. */
	if (ret)
		return ret;

	unsigned n = starpu_matrix_get_nx(dataA);
	double flop = (2.0f*n*n*n)/3.0f;

	PRINTF("# size\tms\tGFlops");
	if (bound)
		PRINTF("\tTms\tTGFlops");
	PRINTF("\n");
	PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f);
	if (bound)
	{
		double min;
		starpu_bound_compute(&min, NULL, 0);
		PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f);
	}
	PRINTF("\n");

	/* gather all the data */
	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
	starpu_data_unregister(dataA);

	free(piv_description);
	return ret;
}
static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel) { int ret; /* create a new codelet */ struct starpu_task *entry_task = NULL; /* create all the DAG nodes */ unsigned i,j,k; starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); for (k = 0; k < nbigblocks; k++) { struct starpu_task *task = create_task_11(dataA, k, reclevel); /* we defer the launch of the first task */ if (k == 0) { entry_task = task; } else { ret = starpu_task_submit(task); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } for (j = k+1; j<nblocks; j++) { ret = create_task_21(dataA, k, j, reclevel); if (ret == -ENODEV) return 77; for (i = k+1; i<nblocks; i++) { if (i <= j) { ret = create_task_22(dataA, k, i, j, reclevel); if (ret == -ENODEV) return 77; } } } } /* schedule the codelet */ ret = starpu_task_submit(entry_task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); return 77; } if (nblocks == nbigblocks) { /* stall the application until the end of computations */ starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel)); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); return 0; } else { STARPU_ASSERT(reclevel == 0); unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks); starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t)); STARPU_ASSERT(tag_array); unsigned ind = 0; for (i = nbigblocks; i < nblocks; i++) for 
(j = nbigblocks; j < nblocks; j++) { if (i <= j) tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel); } starpu_tag_wait_array(ind, tag_array); free(tag_array); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)]; return cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1); } }
static void init_matrix(int rank) { #ifdef STARPU_HAVE_LIBNUMA if (numa) { fprintf(stderr, "Using INTERLEAVE policy\n"); unsigned long nodemask = ((1<<0)|(1<<1)); int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3); if (ret) perror("set_mempolicy failed"); } #endif /* Allocate a grid of data handles, not all of them have to be allocated later on */ dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t)); dataA = calloc(nblocks*nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE); /* Allocate all the blocks that belong to this mpi node */ unsigned long i,j; for (j = 0; j < nblocks; j++) { for (i = 0; i < nblocks; i++) { TYPE **blockptr = &dataA[j+i*nblocks]; // starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; if (get_block_rank(i, j) == rank) { /* This blocks should be treated by the current MPI process */ /* Allocate and fill it */ starpu_malloc((void **)blockptr, blocksize); allocated_memory += blocksize; //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); fill_block_with_random(*blockptr, size, nblocks); //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); if (i == j) { unsigned tmp; for (tmp = 0; tmp < size/nblocks; tmp++) { (*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks; } } /* Register it to StarPU */ starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*blockptr, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } else { *blockptr = STARPU_POISON_PTR; *handleptr = STARPU_POISON_PTR; } } } /* Allocate the temporary buffers required for the distributed algorithm */ unsigned k; /* tmp buffer 11 */ #ifdef SINGLE_TMP11 starpu_malloc((void **)&tmp_11_block, blocksize); allocated_memory_extra += blocksize; starpu_matrix_data_register(&tmp_11_block_handle, 
STARPU_MAIN_RAM, (uintptr_t)tmp_11_block, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); #else tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_11_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); for (k = 0; k < nblocks; k++) { if (tmp_11_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_11_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_11_block[k]); starpu_matrix_data_register(&tmp_11_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_11_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif /* tmp buffers 12 and 21 */ #ifdef SINGLE_TMP1221 tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block = calloc(nblocks, sizeof(TYPE *)); tmp_21_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); #else for (i = 0; i < 2; i++) { tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *)); tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); } #endif for (k = 0; k < nblocks; k++) { #ifdef SINGLE_TMP1221 if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[k]); starpu_matrix_data_register(&tmp_12_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[k]); 
starpu_matrix_data_register(&tmp_21_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } #else for (i = 0; i < 2; i++) { if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[i][k]); starpu_matrix_data_register(&tmp_12_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[i][k]); starpu_matrix_data_register(&tmp_21_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif } //display_all_blocks(nblocks, size/nblocks); }
int main(int argc, char **argv) { int ret; assert(HEIGHT % (2*BLOCK_HEIGHT) == 0); assert(HEIGHT % FACTOR == 0); parse_args(argc, argv); /* fprintf(stderr, "Reading input file ...\n"); */ /* how many frames ? */ struct stat stbuf; stat(filename_in, &stbuf); size_t filesize = stbuf.st_size; unsigned nframes = filesize/FRAMESIZE; /* fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */ assert((filesize % sizeof(struct yuv_frame)) == 0); struct yuv_frame *yuv_in_buffer = (struct yuv_frame *) malloc(nframes*FRAMESIZE); assert(yuv_in_buffer); /* fprintf(stderr, "Alloc output file ...\n"); */ struct yuv_new_frame *yuv_out_buffer = (struct yuv_new_frame *) calloc(nframes, NEW_FRAMESIZE); assert(yuv_out_buffer); /* fetch input data */ FILE *f_in = fopen(filename_in, "r"); assert(f_in); /* allocate room for an output buffer */ FILE *f_out = fopen(filename_out, "w+"); assert(f_out); fread(yuv_in_buffer, FRAMESIZE, nframes, f_in); starpu_data_handle_t *frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_y_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_u_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); starpu_data_handle_t *new_frame_v_handle = (starpu_data_handle_t *) calloc(nframes, sizeof(starpu_data_handle_t)); ret = starpu_init(NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* register and partition all layers */ unsigned frame; for (frame = 0; frame < nframes; frame++) { /* register Y layer */ starpu_matrix_data_register(&frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].y, WIDTH, WIDTH, 
HEIGHT, sizeof(uint8_t)); starpu_data_partition(frame_y_handle[frame], &filter_y); starpu_matrix_data_register(&new_frame_y_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].y, NEW_WIDTH, NEW_WIDTH, NEW_HEIGHT, sizeof(uint8_t)); starpu_data_partition(new_frame_y_handle[frame], &filter_y); /* register U layer */ starpu_matrix_data_register(&frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].u, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_u_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_u_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].u, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_u_handle[frame], &filter_uv); /* register V layer */ starpu_matrix_data_register(&frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_in_buffer[frame].v, WIDTH/2, WIDTH/2, HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(frame_v_handle[frame], &filter_uv); starpu_matrix_data_register(&new_frame_v_handle[frame], STARPU_MAIN_RAM, (uintptr_t)&yuv_out_buffer[frame].v, NEW_WIDTH/2, NEW_WIDTH/2, NEW_HEIGHT/2, sizeof(uint8_t)); starpu_data_partition(new_frame_v_handle[frame], &filter_uv); } /* how many tasks are there ? 
*/ unsigned nblocks_y = filter_y.nchildren; unsigned nblocks_uv = filter_uv.nchildren; unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes; fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes); start = starpu_timing_now(); /* do the computation */ for (frame = 0; frame < nframes; frame++) { unsigned blocky; for (blocky = 0; blocky < nblocks_y; blocky++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_y_handle[frame], 1, blocky); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_y_handle[frame], 1, blocky); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blocku; for (blocku = 0; blocku < nblocks_uv; blocku++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_u_handle[frame], 1, blocku); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_u_handle[frame], 1, blocku); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } unsigned blockv; for (blockv = 0; blockv < nblocks_uv; blockv++) { struct starpu_task *task = starpu_task_create(); task->cl = &ds_codelet; /* input */ task->handles[0] = starpu_data_get_sub_data(frame_v_handle[frame], 1, blockv); /* output */ task->handles[1] = starpu_data_get_sub_data(new_frame_v_handle[frame], 1, blockv); ret = starpu_task_submit(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } } /* make sure all output buffers are sync'ed */ for (frame = 0; frame < nframes; frame++) { starpu_data_unregister(frame_y_handle[frame]); starpu_data_unregister(frame_u_handle[frame]); starpu_data_unregister(frame_v_handle[frame]); starpu_data_unregister(new_frame_y_handle[frame]); starpu_data_unregister(new_frame_u_handle[frame]); starpu_data_unregister(new_frame_v_handle[frame]); } /* There is an implicit barrier: 
the unregister methods will block * until the computation is done and that the result was put back into * memory. */ end = starpu_timing_now(); double timing = end - start; printf("# s\tFPS\n"); printf("%f\t%f\n", timing/1000000, (1000000*nframes)/timing); fwrite(yuv_out_buffer, NEW_FRAMESIZE, nframes, f_out); /* partition the layers into smaller parts */ starpu_shutdown(); if (fclose(f_in) != 0) fprintf(stderr, "Could not close %s properly\n", filename_in); if (fclose(f_out) != 0) fprintf(stderr, "Could not close %s properly\n", filename_out); return 0; }