static void partition_mult_data(void) { gettimeofday(&start, NULL); starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, ydim, ydim, zdim, sizeof(float)); starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B, zdim, zdim, xdim, sizeof(float)); starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, ydim, ydim, xdim, sizeof(float)); starpu_data_set_wt_mask(C_handle, 1<<0); conf.k = zdim; conf.m = ydim/nslicesy; conf.n = xdim/nslicesx; struct starpu_data_filter f; f.filter_func = starpu_vertical_block_filter_func; f.nchildren = nslicesx; f.get_nchildren = NULL; f.get_child_ops = NULL; struct starpu_data_filter f2; f2.filter_func = starpu_block_filter_func; f2.nchildren = nslicesy; f2.get_nchildren = NULL; f2.get_child_ops = NULL; starpu_data_partition(B_handle, &f); starpu_data_partition(A_handle, &f2); starpu_data_map_filters(C_handle, 2, &f, &f2); }
static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks) { starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); _cholesky(dataA, nblocks); starpu_data_unregister(dataA); }
/*
 * LU-factorize matA in place using a nblocks x nblocks tiling.
 *
 *  matA    : matrix storage
 *  size    : dimension passed as both nx and ny at registration
 *  ld      : leading dimension of matA
 *  nblocks : number of children produced by each of the two filters
 *
 * Returns whatever dw_codelet_facto_v3() returned; the data is gathered back
 * to main RAM and unregistered regardless of that value.
 */
int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned nblocks)
{
	/* Two-level partitioning: a tile is then addressed by a pair (i,j). */
	struct starpu_data_filter vertical_split =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};
	struct starpu_data_filter block_split =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_handle_t matrix_handle;
	int status;

	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matA,
				    ld, size, size, sizeof(TYPE));
	starpu_data_map_filters(matrix_handle, 2, &vertical_split, &block_split);

	status = dw_codelet_facto_v3(matrix_handle, nblocks);

	/* Bring every tile back into the original buffer before releasing it. */
	starpu_data_unpartition(matrix_handle, STARPU_MAIN_RAM);
	starpu_data_unregister(matrix_handle);

	return status;
}
/*
 * LU-factorize matA in place with pivoting, using a nblocks x nblocks tiling,
 * and print the elapsed time and a synthetic GFlops figure on stderr.
 *
 *  matA    : matrix storage
 *  ipiv    : pivot array of at least size entries; initialized here to the
 *            identity permutation (ipiv[i] = i) before the factorization
 *  size    : dimension passed as both nx and ny at registration
 *  ld      : leading dimension of matA
 *  nblocks : value given as filter_arg to both partitioning filters, and
 *            number of per-block pivot descriptors
 *
 * Block b's descriptor covers indices [b*(size/nblocks), (b+1)*(size/nblocks));
 * the division is integral, so a remainder of size % nblocks is not covered.
 */
void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	/* We already enforce deps by hand */
	starpu_data_set_sequential_consistency_flag(dataA, 0);

	/* Designated initializers zero every field that is not listed, so no
	 * indeterminate member is handed to the library. */
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vertical_block_filter_func,
		.filter_arg = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_block_filter_func,
		.filter_arg = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* Every descriptor shares the same pivot array and only differs by the
	 * index range it is responsible for. */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

#if 0
	unsigned j;
	for (j = 0; j < nblocks; j++)
	for (i = 0; i < nblocks; i++)
	{
		printf("BLOCK %d %d	%p\n", i, j, &matA[i*(size/nblocks) + j * (size/nblocks)*ld]);
	}
#endif

	double timing;
	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);

	/* The value is divided by 1000 to be reported in ms. */
	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataA);
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	/* gather all the data */
	starpu_data_unpartition(dataA, 0);

	/* The descriptors were only needed by the factorization; release them
	 * once the data has been gathered (previously leaked on every call). */
	free(piv_description);
}
int main(int argc, char **argv) { int ret; starpu_init(NULL); starpu_data_malloc_pinned_if_possible((void **)&buffer, VECTORSIZE); starpu_vector_data_register(&v_handle, 0, (uintptr_t)buffer, VECTORSIZE, sizeof(char)); struct starpu_data_filter f = { .filter_func = starpu_vector_divide_in_2_filter_func, /* there are only 2 children */ .nchildren = 2, /* the length of the first part */ .filter_arg = VECTORSIZE/2, .get_nchildren = NULL, .get_child_ops = NULL }; unsigned iter; for (iter = 0; iter < NITER; iter++) { starpu_data_map_filters(v_handle, 1, &f); ret = use_handle(starpu_data_get_sub_data(v_handle, 1, 0)); if (ret == -ENODEV) goto enodev; ret = use_handle(starpu_data_get_sub_data(v_handle, 1, 1)); if (ret == -ENODEV) goto enodev; starpu_task_wait_for_all(); starpu_data_unpartition(v_handle, 0); ret = use_handle(v_handle); if (ret == -ENODEV) goto enodev; starpu_task_wait_for_all(); } starpu_data_unregister(v_handle); starpu_shutdown(); return 0; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ return 0; }
void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio) { #ifdef CHECK_RESULTS FPRINTF(stderr, "Checking results ...\n"); float *Asaved; Asaved = malloc((size_t)ld*ld*sizeof(float)); memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float)); #endif no_prio = _no_prio; starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); dw_codelet_facto_v3(dataA, nblocks); /* gather all the data */ starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); #ifdef CHECK_RESULTS compare_A_LU(Asaved, matA, size, ld); #endif }
static void partition_mult_data(void) { starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A, ydim, ydim, zdim, sizeof(TYPE)); starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B, zdim, zdim, xdim, sizeof(TYPE)); starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C, ydim, ydim, xdim, sizeof(TYPE)); struct starpu_data_filter vert; memset(&vert, 0, sizeof(vert)); vert.filter_func = starpu_matrix_filter_vertical_block; vert.nchildren = nslicesx; struct starpu_data_filter horiz; memset(&horiz, 0, sizeof(horiz)); horiz.filter_func = starpu_matrix_filter_block; horiz.nchildren = nslicesy; starpu_data_partition(B_handle, &vert); starpu_data_partition(A_handle, &horiz); starpu_data_map_filters(C_handle, 2, &vert, &horiz); }
/*
 * LU-factorize matA in place with pivoting, using a nblocks x nblocks tiling,
 * and print a timing / GFlops report through PRINTF on success.
 *
 *  matA    : matrix storage
 *  ipiv    : pivot array of at least size entries; initialized here to the
 *            identity permutation (ipiv[i] = i) before the factorization
 *  size    : dimension passed as both nx and ny at registration
 *  ld      : leading dimension of matA
 *  nblocks : number of children produced by each of the two filters, and
 *            number of per-block pivot descriptors
 *
 * Returns the value returned by dw_codelet_facto_pivot() (0 on success).
 * Whatever that value is, the data is unpartitioned and unregistered and the
 * pivot descriptors are freed before returning.
 */
int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle_t dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* Every descriptor shares the same pivot array and only differs by the
	 * index range [first, last) it is responsible for. */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);

	/* The report is only meaningful when the factorization succeeded. */
	if (ret == 0)
	{
		unsigned n = starpu_matrix_get_nx(dataA);
		double flop = (2.0f*n*n*n)/3.0f;

		PRINTF("# size\tms\tGFlops");
		if (bound)
			PRINTF("\tTms\tTGFlops");
		PRINTF("\n");
		PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f);
		if (bound)
		{
			double min;
			starpu_bound_compute(&min, NULL, 0);
			PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f);
		}
		PRINTF("\n");
	}

	/* gather all the data */
	/* This cleanup now also runs when ret != 0; the early return used to
	 * leak piv_description and leave dataA partitioned and registered.
	 * NOTE(review): assumes unpartitioning waits for any task that was
	 * already submitted before the failure -- confirm against
	 * dw_codelet_facto_pivot(). */
	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
	starpu_data_unregister(dataA);

	free(piv_description);
	return ret;
}
static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel) { int ret; /* create a new codelet */ struct starpu_task *entry_task = NULL; /* create all the DAG nodes */ unsigned i,j,k; starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); for (k = 0; k < nbigblocks; k++) { struct starpu_task *task = create_task_11(dataA, k, reclevel); /* we defer the launch of the first task */ if (k == 0) { entry_task = task; } else { ret = starpu_task_submit(task); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } for (j = k+1; j<nblocks; j++) { ret = create_task_21(dataA, k, j, reclevel); if (ret == -ENODEV) return 77; for (i = k+1; i<nblocks; i++) { if (i <= j) { ret = create_task_22(dataA, k, i, j, reclevel); if (ret == -ENODEV) return 77; } } } } /* schedule the codelet */ ret = starpu_task_submit(entry_task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); return 77; } if (nblocks == nbigblocks) { /* stall the application until the end of computations */ starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel)); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); return 0; } else { STARPU_ASSERT(reclevel == 0); unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks); starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t)); STARPU_ASSERT(tag_array); unsigned ind = 0; for (i = nbigblocks; i < nblocks; i++) for 
(j = nbigblocks; j < nblocks; j++) { if (i <= j) tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel); } starpu_tag_wait_array(ind, tag_array); free(tag_array); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)]; return cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1); } }