/*
 * Build and submit the "21" task for block (k, j) at recursion level
 * reclevel.
 *
 * The task runs codelet cl21 on sub-blocks (k, k) and (k, j) of dataA and is
 * tagged TAG21_AUX(k, j, reclevel).  It always depends on
 * TAG11_AUX(k, reclevel) and, for k > 0, also on
 * TAG22_AUX(k-1, k, j, reclevel).
 *
 * Returns the value of starpu_task_submit().  -ENODEV is handed back to the
 * caller untouched; every other value goes through
 * STARPU_CHECK_RETURN_VALUE() first.
 */
static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
{
	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));

	task->cl = &cl21;

	/* sub-blocks this task works on */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);

	/* the block right next to the diagonal gets the highest priority */
	if (j == k+1)
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* declare tag dependencies by hand */
	if (k == 0)
	{
		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
	}
	else
	{
		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
	}

	int block_dim = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_STRSM(block_dim, block_dim);

	int ret = starpu_task_submit(task);
	if (ret != -ENODEV)
	{
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	}

	return ret;
}
/*
 * Prepare the "11" task of step k; the task is returned, not submitted.
 *
 * The task runs codelet cl11 on the diagonal sub-block (k, k) of dataA and
 * is tagged TAG11(k).  Unless the noprio flag is set it is given
 * STARPU_MAX_PRIO.  For k > 0 it depends on TAG22(k-1, k, k).
 */
static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
{
	struct starpu_task *task = create_task(TAG11(k));

	task->cl = &cl11;

	/* the diagonal block is the only piece of data involved */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);

	/* this is an important task */
	if (!noprio)
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* the first step has nothing to wait for */
	if (k > 0)
	{
		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
	}

	int diag_dim = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_SPOTRF(diag_dim);

	return task;
}
/*
 * LU decomposition with pivoting of a matrix stored as nblocks x nblocks
 * separately allocated blocks.
 *
 *   matA     array of block pointers; block (bi, bj) is matA[bi + nblocks*bj]
 *   ipiv     pivot array of `size` entries, initialised here to the identity
 *   size     used to derive the block dimension (size / nblocks)
 *   ld       not used by this variant
 *   nblocks  number of blocks per dimension
 *
 * Every block is registered as a (size/nblocks) x (size/nblocks) matrix and
 * the factorization is delegated to dw_codelet_facto_pivot().
 *
 * Returns 0 on success, or the non-zero value reported by
 * dw_codelet_facto_pivot().
 */
int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	(void)ld;

	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], STARPU_MAIN_RAM,
			(uintptr_t)matA[bi+nblocks*bj],
			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
	}

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* each block row gets its own [first, last) window into ipiv */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
	if (ret)
	{
		/* NOTE(review): nothing is released on this path; tasks that were
		 * already submitted may still reference the handles and the pivot
		 * description -- confirm before adding cleanup here. */
		return ret;
	}

	unsigned n = starpu_matrix_get_nx(dataAp[0])*nblocks;
	double flop = (2.0f*n*n*n)/3.0f;

	PRINTF("# size\tms\tGFlops");
	if (bound)
		PRINTF("\tTms\tTGFlops");
	PRINTF("\n");
	PRINTF("%u\t%.0f\t%.1f", n, timing/1000, flop/timing/1000.0f);
	if (bound)
	{
		double min;
		starpu_bound_compute(&min, NULL, 0);
		PRINTF("\t%.0f\t%.1f", min, flop/min/1000000.0f);
	}
	PRINTF("\n");

	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_data_unregister(dataAp[bi+nblocks*bj]);
	}

	free(dataAp);
	/* the pivot description was allocated above and must not leak */
	free(piv_description);

	return ret;
}
/*
 * Cost estimate for a matrix product C = A * B.
 *
 * Following the naming used here, descr[2] is C and descr[0] is A.  The
 * estimate is nx(C) * ny(C) * (nx(A) / 1000 / 4.11), evaluated in double
 * precision with single-precision constants.
 */
double gemm_cost(starpu_buffer_descr *descr)
{
	/* C = A * B */
	uint32_t nxC = starpu_matrix_get_nx(descr[2].handle);
	uint32_t nyC = starpu_matrix_get_ny(descr[2].handle);
	uint32_t nxA = starpu_matrix_get_nx(descr[0].handle);

	/* inner dimension, scaled by the model constants */
	double scaled_inner = (double)nxA/1000.0f/4.11f;

	return ((double)nxC)*((double)nyC)*scaled_inner;
}
/*
 * LU decomposition with pivoting of a contiguous matrix.
 *
 *   matA     matrix storage, registered with leading dimension ld
 *   ipiv     pivot array of `size` entries, initialised here to the identity
 *   size     matrix order (registered as size x size)
 *   ld       leading dimension of matA
 *   nblocks  number of blocks per dimension used for the partitioning
 *
 * The matrix is registered, partitioned with two nested block filters, and
 * factorized by dw_codelet_facto_pivot().  Timing and a GFlops figure are
 * written to stderr, then the data is unpartitioned.
 */
void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	/* We already enforce deps by hand */
	starpu_data_set_sequential_consistency_flag(dataA, 0);

	struct starpu_data_filter f;
		f.filter_func = starpu_vertical_block_filter_func;
		f.filter_arg = nblocks;

	struct starpu_data_filter f2;
		f2.filter_func = starpu_block_filter_func;
		f2.filter_arg = nblocks;

	starpu_data_map_filters(dataA, 2, &f, &f2);

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* each block row gets its own [first, last) window into ipiv */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

#if 0
	unsigned j;
	for (j = 0; j < nblocks; j++)
	for (i = 0; i < nblocks; i++)
	{
		printf("BLOCK %d %d	%p\n", i, j, &matA[i*(size/nblocks) + j * (size/nblocks)*ld]);
	}
#endif

	double timing;
	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataA);
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	/* gather all the data */
	starpu_data_unpartition(dataA, 0);

	/* the pivot description was allocated above and must not leak */
	free(piv_description);

	/* NOTE(review): dataA is never unregistered in this variant -- confirm
	 * whether the handle should be released here as well. */
}
/*
 * Performance-model cost estimate for the "11" task.
 *
 * n is the nx dimension of the task's first data handle; the estimate is
 * n^3 / 537.5, passed through PERTURBATE().  nimpl is not used.
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_11_cost(struct starpu_task *task, unsigned nimpl)
{
	uint32_t n;

	n = starpu_matrix_get_nx(task->handles[0]);

	double cost = (((double)n*n*n)/537.5);

	return PERTURBATE(cost);
}
/*
 * Performance-model cost estimate for the "11" task (buffer-descriptor
 * interface).
 *
 * n is the nx dimension of descr[0].handle; the estimate is n^3 / 537.5,
 * passed through PERTURBATE().
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_11_cost(starpu_buffer_descr *descr)
{
	uint32_t n;

	n = starpu_matrix_get_nx(descr[0].handle);

	double cost = (((double)n*n*n)/537.5);

	return PERTURBATE(cost);
}
/*
 * CUDA cost estimate for the "12" task.
 *
 * n is the nx dimension of the task's first data handle; the estimate is
 * n^3 / 42838.5718, passed through PERTURBATE().  arch and nimpl are not
 * used.
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
{
	uint32_t n;

	n = starpu_matrix_get_nx(task->handles[0]);

	double cost = (((double)n*n*n)/42838.5718);

	return PERTURBATE(cost);
}
/*
 * CPU cost estimate for the "21" task.
 *
 * n is the nx dimension of the task's first data handle; the estimate is
 * n^3 / 6793.8423, passed through PERTURBATE().  arch and nimpl are not
 * used.
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
{
	uint32_t n;

	n = starpu_matrix_get_nx(task->handles[0]);

	double cost = (((double)n*n*n)/6793.8423);

	return PERTURBATE(cost);
}
/*
 * CPU cost estimate for the "21" task (buffer-descriptor interface).
 *
 * n is the nx dimension of descr[0].handle; the estimate is
 * n^3 / 6793.8423, passed through PERTURBATE().
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_21_cost_cpu(starpu_buffer_descr *descr)
{
	uint32_t n;

	n = starpu_matrix_get_nx(descr[0].handle);

	double cost = (((double)n*n*n)/6793.8423);

	return PERTURBATE(cost);
}
/*
 * CUDA cost estimate for the "21" task (buffer-descriptor interface).
 *
 * n is the nx dimension of descr[0].handle; the estimate is
 * n^3 / 49208.667, passed through PERTURBATE().
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_21_cost_cuda(starpu_buffer_descr *descr)
{
	uint32_t n;

	n = starpu_matrix_get_nx(descr[0].handle);

	double cost = (((double)n*n*n)/49208.667);

	return PERTURBATE(cost);
}
/*
 * Performance-model cost estimate for the "21" task.
 *
 * n is the nx dimension of the task's first data handle; the estimate is
 * n^3 / 3691.53, passed through PERTURBATE().  nimpl is not used.
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_21_cost(struct starpu_task *task, unsigned nimpl)
{
	uint32_t n;

	n = starpu_matrix_get_nx(task->handles[0]);

	/* an alternative divisor, 1744.695, was left disabled here */
	double cost = (((double)n*n*n)/3691.53);

	return PERTURBATE(cost);
}
/*
 * Performance-model cost estimate for the "22" task.
 *
 * nx and ny are the dimensions of the task's third data handle and nz is
 * the ny dimension of its first handle; the estimate is
 * nx*ny*nz / 4110.0, passed through PERTURBATE().  nimpl is not used.
 *
 * The product is computed in double precision: in uint32_t arithmetic it
 * wraps around once it reaches 2^32.
 */
double task_22_cost(struct starpu_task *task, unsigned nimpl)
{
	uint32_t nx, ny, nz;

	nx = starpu_matrix_get_nx(task->handles[2]);
	ny = starpu_matrix_get_ny(task->handles[2]);
	nz = starpu_matrix_get_ny(task->handles[0]);

	double cost = (((double)nx*ny*nz)/4110.0);

	return PERTURBATE(cost);
}
/*
 * Performance-model cost estimate for the "21" task (buffer-descriptor
 * interface).
 *
 * n is the nx dimension of descr[0].handle; the estimate is
 * n^3 / 3691.53, passed through PERTURBATE().
 *
 * The cube is computed in double precision: in uint32_t arithmetic n*n*n
 * wraps around as soon as n reaches 1626.
 */
double task_21_cost(starpu_buffer_descr *descr)
{
	uint32_t n;

	n = starpu_matrix_get_nx(descr[0].handle);

	/* an alternative divisor, 1744.695, was left disabled here */
	double cost = (((double)n*n*n)/3691.53);

	return PERTURBATE(cost);
}
/*
 * Performance-model cost estimate for the "22" task (buffer-descriptor
 * interface).
 *
 * nx and ny are the dimensions of descr[2].handle and nz is the ny
 * dimension of descr[0].handle; the estimate is nx*ny*nz / 4110.0, passed
 * through PERTURBATE().
 *
 * The product is computed in double precision: in uint32_t arithmetic it
 * wraps around once it reaches 2^32.
 */
double task_22_cost(starpu_buffer_descr *descr)
{
	uint32_t nx, ny, nz;

	nx = starpu_matrix_get_nx(descr[2].handle);
	ny = starpu_matrix_get_ny(descr[2].handle);
	nz = starpu_matrix_get_ny(descr[0].handle);

	double cost = (((double)nx*ny*nz)/4110.0);

	return PERTURBATE(cost);
}
/*
 * CPU cost estimate for the "22" task (buffer-descriptor interface).
 *
 * nx and ny are the dimensions of descr[2].handle and nz is the ny
 * dimension of descr[0].handle; the estimate is nx*ny*nz / 4203.0175,
 * passed through PERTURBATE().
 *
 * The product is computed in double precision: in uint32_t arithmetic it
 * wraps around once it reaches 2^32.
 */
double task_22_cost_cpu(starpu_buffer_descr *descr)
{
	uint32_t nx, ny, nz;

	nx = starpu_matrix_get_nx(descr[2].handle);
	ny = starpu_matrix_get_ny(descr[2].handle);
	nz = starpu_matrix_get_ny(descr[0].handle);

	double cost = (((double)nx*ny*nz)/4203.0175);

	return PERTURBATE(cost);
}
/*
 * CPU cost estimate for the "22" task.
 *
 * nx and ny are the dimensions of the task's third data handle and nz is
 * the ny dimension of its first handle; the estimate is
 * nx*ny*nz / 4203.0175, passed through PERTURBATE().  arch and nimpl are
 * not used.
 *
 * The product is computed in double precision: in uint32_t arithmetic it
 * wraps around once it reaches 2^32.
 */
double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
{
	uint32_t nx, ny, nz;

	nx = starpu_matrix_get_nx(task->handles[2]);
	ny = starpu_matrix_get_ny(task->handles[2]);
	nz = starpu_matrix_get_ny(task->handles[0]);

	double cost = (((double)nx*ny*nz)/4203.0175);

	return PERTURBATE(cost);
}
/*
 * CPU cost estimate for the Cholesky "22" task.
 *
 * n is the nx dimension of descr[0].handle; the estimate is
 * n^3 / 50 / 10.75 / 8.0760, passed through PERTURBATE().  When
 * STARPU_MODEL_DEBUG is defined the inputs and the estimate are printed.
 */
static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
{
	uint32_t n;

	n = starpu_matrix_get_nx(descr[0].handle);

	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);

#ifdef STARPU_MODEL_DEBUG
	/* n is unsigned: print it with a matching conversion specifier */
	printf("cpu_chol_task_22_cost n %u cost %e\n", (unsigned)n, cost);
#endif

	return PERTURBATE(cost);
}
/*
 * LU decomposition with pivoting of a matrix stored as nblocks x nblocks
 * separately allocated blocks.
 *
 *   matA     array of block pointers; block (bi, bj) is matA[bi + nblocks*bj]
 *   ipiv     pivot array of `size` entries, initialised here to the identity
 *   size     used to derive the block dimension (size / nblocks)
 *   ld       not used by this variant
 *   nblocks  number of blocks per dimension
 *
 * Every block is registered as a (size/nblocks) x (size/nblocks) matrix
 * with sequential consistency disabled, and the factorization is delegated
 * to dw_codelet_facto_pivot().  Timing and a GFlops figure are written to
 * stderr.
 */
void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	(void)ld;

	starpu_data_handle *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle));

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], 0,
			(uintptr_t)matA[bi+nblocks*bj],
			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));

		/* We already enforce deps by hand */
		starpu_data_set_sequential_consistency_flag(dataAp[bi+nblocks*bj], 0);
	}

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* each block row gets its own [first, last) window into ipiv */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	timing = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataAp[0])*nblocks;
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_data_unregister(dataAp[bi+nblocks*bj]);
	}

	/* both arrays were allocated above and must not leak */
	free(dataAp);
	free(piv_description);
}
/*
 * Build and submit the "22" task of step k for block (i, j).
 *
 * The task runs codelet cl22 on sub-blocks (k, i), (k, j) and (i, j) of
 * dataA and is tagged TAG22(k, i, j).  It depends on TAG21(k, i) and
 * TAG21(k, j) and, for k > 0, also on TAG22(k-1, i, j).  Unless noprio is
 * set, the block at (k+1, k+1) gets STARPU_MAX_PRIO.
 *
 * If submission reports -ENODEV a message is printed to stderr and the
 * process exits with status 0.
 */
static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
{
	struct starpu_task *task = create_task(TAG22(k, i, j));

	task->cl = &cl22;

	/* sub-blocks this task works on */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);

	if (!noprio && (i == k + 1) && (j == k +1))
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* declare tag dependencies by hand */
	if (k == 0)
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
	}
	else
	{
		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
	}

	int block_dim = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_SGEMM(block_dim, block_dim, block_dim);

	int submit_status = starpu_task_submit(task);
	if (STARPU_UNLIKELY(submit_status == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		exit(0);
	}
}
/*
 * Build and submit the "22" task of step k for block (i, j) at recursion
 * level reclevel.
 *
 * The task runs codelet cl22 on sub-blocks (k, i), (k, j) and (i, j) of
 * dataA and is tagged TAG22_AUX(k, i, j, reclevel).  It depends on
 * TAG21_AUX(k, i, reclevel) and TAG21_AUX(k, j, reclevel) and, for k > 0,
 * also on TAG22_AUX(k-1, i, j, reclevel).  The block at (k+1, k+1) gets
 * STARPU_MAX_PRIO.
 *
 * Returns the value of starpu_task_submit().  -ENODEV is handed back to the
 * caller untouched; every other value goes through
 * STARPU_CHECK_RETURN_VALUE() first.
 */
static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
{
	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));

	task->cl = &cl22;

	/* sub-blocks this task works on */
	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);

	if ((i == k + 1) && (j == k +1))
	{
		task->priority = STARPU_MAX_PRIO;
	}

	/* declare tag dependencies by hand */
	if (k == 0)
	{
		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
	}
	else
	{
		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
	}

	int block_dim = starpu_matrix_get_nx(task->handles[0]);
	task->flops = FLOPS_SGEMM(block_dim, block_dim, block_dim);

	int ret = starpu_task_submit(task);
	if (ret != -ENODEV)
	{
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	}

	return ret;
}
/*
 * Build the whole Cholesky task graph over the nblocks x nblocks partition
 * of dataA, run it, and print the elapsed time and a GFlops figure.
 *
 * All tasks are created and submitted up front, except the very first "11"
 * task, whose submission is deferred until the rest of the graph exists.
 * The function then waits on TAG11(nblocks-1) and unpartitions dataA into
 * main memory.
 *
 * If a submission made here reports -ENODEV a message is printed to stderr
 * and the process exits with status 0.
 *
 * NOTE(review): with nblocks == 0 the entry task stays NULL and is still
 * passed to starpu_task_submit() -- confirm callers never pass 0.
 */
static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
{
	struct starpu_task *entry_task = NULL;
	unsigned i, j, k;
	int ret;

	double start = starpu_timing_now();

	/* create all the DAG nodes */
	for (k = 0; k < nblocks; k++)
	{
		struct starpu_task *diag_task = create_task_11(dataA, k);

		if (k != 0)
		{
			ret = starpu_task_submit(diag_task);
			if (STARPU_UNLIKELY(ret == -ENODEV))
			{
				FPRINTF(stderr, "No worker may execute this task\n");
				exit(0);
			}
		}
		else
		{
			/* we defer the launch of the first task */
			entry_task = diag_task;
		}

		for (j = k+1; j<nblocks; j++)
		{
			create_task_21(dataA, k, j);

			/* j < nblocks here, so bounding i by j visits exactly the
			 * blocks with k < i <= j */
			for (i = k+1; i <= j; i++)
			{
				create_task_22(dataA, k, i, j);
			}
		}
	}

	/* schedule the codelet */
	ret = starpu_task_submit(entry_task);
	if (STARPU_UNLIKELY(ret == -ENODEV))
	{
		FPRINTF(stderr, "No worker may execute this task\n");
		exit(0);
	}

	/* stall the application until the end of computations */
	starpu_tag_wait(TAG11(nblocks-1));

	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);

	double end = starpu_timing_now();
	double elapsed = end - start;

	unsigned order = starpu_matrix_get_nx(dataA);
	double flop_count = (1.0f*order*order*order)/3.0f;

	PRINTF("# size\tms\tGFlops\n");
	PRINTF("%u\t%.0f\t%.1f\n", order, elapsed/1000, (flop_count/elapsed/1000.0f));
}
/*
 * CUDA cost estimate for the sgemm task.
 *
 * With n the nx dimension of the task's first data handle, the estimate is
 * n^3 / 50 / 10.75 / 76.30666, computed in double precision.  arch and
 * nimpl are not used.
 */
static double sgemm_cuda_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
{
	uint32_t dim = starpu_matrix_get_nx(task->handles[0]);

	/* cube first, then apply the model constants in their original order */
	double cube = (double)(dim)*dim*dim;

	return cube/50.0f/10.75/76.30666;
}
/*
 * LU decomposition with pivoting of a contiguous matrix.
 *
 *   matA     matrix storage, registered with leading dimension ld
 *   ipiv     pivot array of `size` entries, initialised here to the identity
 *   size     matrix order (registered as size x size)
 *   ld       leading dimension of matA
 *   nblocks  number of blocks per dimension used for the partitioning
 *
 * The matrix is registered, partitioned with two nested block filters and
 * factorized by dw_codelet_facto_pivot().  On success the timing (and, when
 * `bound` is set, the computed bound) is printed, the data is unpartitioned
 * and unregistered, and the pivot description is freed.
 *
 * Returns 0 on success, or the non-zero value reported by
 * dw_codelet_facto_pivot().
 */
int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle_t dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_matrix_filter_vertical_block,
		.nchildren = nblocks
	};

	struct starpu_data_filter f2 =
	{
		.filter_func = starpu_matrix_filter_block,
		.nchildren = nblocks
	};

	starpu_data_map_filters(dataA, 2, &f, &f2);

	/* start from the identity permutation */
	unsigned idx;
	for (idx = 0; idx < size; idx++)
	{
		ipiv[idx] = idx;
	}

	/* each block row gets its own [first, last) window into ipiv */
	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
	unsigned blk;
	for (blk = 0; blk < nblocks; blk++)
	{
		struct piv_s *desc = &piv_description[blk];

		desc->piv = ipiv;
		desc->first = blk * (size / nblocks);
		desc->last = (blk + 1) * (size / nblocks);
	}

	double timing;
	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
	if (ret)
	{
		/* NOTE(review): nothing is released on this path -- confirm
		 * whether cleanup is safe once the factorization has failed. */
		return ret;
	}

	unsigned order = starpu_matrix_get_nx(dataA);
	double flop_count = (2.0f*order*order*order)/3.0f;

	PRINTF("# size\tms\tGFlops");
	if (bound)
	{
		PRINTF("\tTms\tTGFlops");
	}
	PRINTF("\n");

	PRINTF("%u\t%.0f\t%.1f", order, timing/1000, flop_count/timing/1000.0f);
	if (bound)
	{
		double bound_min;
		starpu_bound_compute(&bound_min, NULL, 0);
		PRINTF("\t%.0f\t%.1f", bound_min, flop_count/bound_min/1000000.0f);
	}
	PRINTF("\n");

	/* gather all the data */
	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
	starpu_data_unregister(dataA);

	free(piv_description);

	return ret;
}
/*
 * Submit the whole LU task graph over the nblocks x nblocks partition of
 * dataA, wait for it to finish, and print the elapsed time and a GFlops
 * figure (plus the computed bound when `bound` is set).
 *
 * For every step k the "11" task is created first, then the "12"/"21"
 * pairs, then the "22" tasks; blocks that the loop will not touch again
 * are reported through starpu_data_wont_use().
 *
 * Returns -ENODEV as soon as one of the create_task_* helpers reports it,
 * 0 otherwise.
 */
static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
{
	unsigned i, j, k;
	int ret;

	if (bound)
	{
		starpu_bound_start(bounddeps, boundprio);
	}

	double start = starpu_timing_now();

	/* create all the DAG nodes */
	for (k = 0; k < nblocks; k++)
	{
		ret = create_task_11(dataA, k);
		if (ret == -ENODEV)
		{
			return ret;
		}

		for (i = k+1; i<nblocks; i++)
		{
			ret = create_task_12(dataA, k, i);
			if (ret == -ENODEV)
			{
				return ret;
			}

			ret = create_task_21(dataA, k, i);
			if (ret == -ENODEV)
			{
				return ret;
			}
		}

		/* the diagonal block of this step is not needed any more */
		starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, k));

		for (i = k+1; i<nblocks; i++)
		{
			for (j = k+1; j<nblocks; j++)
			{
				ret = create_task_22(dataA, k, i, j);
				if (ret == -ENODEV)
				{
					return ret;
				}
			}
		}

		/* neither are the row and column panels of this step */
		for (i = k+1; i<nblocks; i++)
		{
			starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
			starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
		}
	}

	/* stall the application until the end of computations */
	starpu_task_wait_for_all();

	double end = starpu_timing_now();

	if (bound)
	{
		starpu_bound_stop();
	}

	double elapsed = end - start;

	unsigned order = starpu_matrix_get_nx(dataA);
	double flop_count = (2.0f*order*order*order)/3.0f;

	PRINTF("# size\tms\tGFlops");
	if (bound)
	{
		PRINTF("\tTms\tTGFlops");
	}
	PRINTF("\n");

	PRINTF("%u\t%.0f\t%.1f", order, elapsed/1000, flop_count/elapsed/1000.0f);
	if (bound)
	{
		double bound_min;
		starpu_bound_compute(&bound_min, NULL, 0);
		PRINTF("\t%.0f\t%.1f", bound_min, flop_count/bound_min/1000000.0f);
	}
	PRINTF("\n");

	return 0;
}