static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks) { starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); _cholesky(dataA, nblocks); starpu_data_unregister(dataA); }
/*
 * LU decomposition with pivoting of a matrix registered as one StarPU
 * matrix and partitioned into blocks by two filters.
 *
 * matA    : matrix storage (size x size elements of TYPE, leading
 *           dimension ld)
 * ipiv    : caller-provided array of at least `size` entries; it is reset
 *           to the identity permutation (ipiv[i] = i) before the
 *           factorisation is launched
 * size    : matrix dimension
 * ld      : leading dimension given at registration
 * nblocks : filter argument of both filters, and number of pivot
 *           descriptions built
 *
 * The value returned by dw_codelet_facto_pivot() and a synthetic GFlops
 * figure derived from it are written to stderr. The process is aborted if
 * the pivot descriptions cannot be allocated.
 */
void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	starpu_data_handle dataA;

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));

	/* We already enforce deps by hand */
	starpu_data_set_sequential_consistency_flag(dataA, 0);

	struct starpu_data_filter f;
	f.filter_func = starpu_vertical_block_filter_func;
	f.filter_arg = nblocks;

	struct starpu_data_filter f2;
	f2.filter_func = starpu_block_filter_func;
	f2.filter_arg = nblocks;

	starpu_data_map_filters(dataA, 2, &f, &f2);

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	/* malloc(0) may legitimately return NULL, hence the nblocks test */
	struct piv_s *piv_description = malloc(nblocks * sizeof(*piv_description));
	if (piv_description == NULL && nblocks > 0)
	{
		fprintf(stderr, "lu_decomposition_pivot: cannot allocate pivot descriptions\n");
		abort();
	}

	/* every description shares ipiv and covers [first, last) with a
	 * width of size / nblocks */
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

#if 0
	unsigned j;
	for (j = 0; j < nblocks; j++)
	for (i = 0; i < nblocks; i++)
	{
		printf("BLOCK %d %d %p\n", i, j, &matA[i*(size/nblocks) + j * (size/nblocks)*ld]);
	}
#endif

	double timing;
	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	unsigned n = starpu_matrix_get_nx(dataA);
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	/* gather all the data */
	starpu_data_unpartition(dataA, 0);

	/* the handle is local to this function: release it, as the no-stride
	 * variant does for its per-block handles */
	starpu_data_unregister(dataA);

	/* NOTE(review): assumes dw_codelet_facto_pivot() does not keep
	 * piv_description once it has returned - it hands back the elapsed
	 * time, so the factorisation is expected to be over. */
	free(piv_description);
}
/*
 * LU decomposition with pivoting of a matrix stored as nblocks x nblocks
 * separately allocated blocks, each registered as its own StarPU matrix.
 *
 * matA    : array of nblocks*nblocks block pointers, block (bi,bj) being
 *           matA[bi + nblocks*bj]; each block is registered as a
 *           (size/nblocks) x (size/nblocks) matrix of TYPE whose leading
 *           dimension is size/nblocks
 * ipiv    : caller-provided array of at least `size` entries; it is reset
 *           to the identity permutation (ipiv[i] = i) before the
 *           factorisation is launched
 * size    : dimension of the whole matrix
 * ld      : not used by this variant
 * nblocks : number of blocks per dimension
 *
 * The value returned by dw_codelet_facto_pivot() and a synthetic GFlops
 * figure derived from it are written to stderr. The process is aborted if
 * a work array cannot be allocated.
 */
void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
{
	/* widen before multiplying so that nblocks*nblocks cannot wrap in
	 * unsigned arithmetic */
	const size_t nhandles = (size_t)nblocks * nblocks;

	/* malloc(0) may legitimately return NULL, hence the nblocks tests */
	starpu_data_handle *dataAp = malloc(nhandles * sizeof(*dataAp));
	if (dataAp == NULL && nblocks > 0)
	{
		fprintf(stderr, "lu_decomposition_pivot_no_stride: cannot allocate data handles\n");
		abort();
	}

	/* monitor and partition the A matrix into blocks :
	 * one block is now determined by 2 unsigned (i,j) */
	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_matrix_data_register(&dataAp[bi+nblocks*bj], 0,
			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
			size/nblocks, size/nblocks, sizeof(TYPE));

		/* We already enforce deps by hand */
		starpu_data_set_sequential_consistency_flag(dataAp[bi+nblocks*bj], 0);
	}

	/* start from the identity permutation */
	unsigned i;
	for (i = 0; i < size; i++)
		ipiv[i] = i;

	struct piv_s *piv_description = malloc(nblocks * sizeof(*piv_description));
	if (piv_description == NULL && nblocks > 0)
	{
		fprintf(stderr, "lu_decomposition_pivot_no_stride: cannot allocate pivot descriptions\n");
		abort();
	}

	/* every description shares ipiv and covers [first, last) with a
	 * width of size / nblocks */
	unsigned block;
	for (block = 0; block < nblocks; block++)
	{
		piv_description[block].piv = ipiv;
		piv_description[block].first = block * (size / nblocks);
		piv_description[block].last = (block + 1) * (size / nblocks);
	}

	double timing;
	timing = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding);

	fprintf(stderr, "Computation took (in ms)\n");
	fprintf(stderr, "%2.2f\n", timing/1000);

	/* dataAp holds no element when nblocks is 0: do not read dataAp[0] */
	unsigned n = (nblocks > 0) ? starpu_matrix_get_nx(dataAp[0])*nblocks : 0;
	double flop = (2.0f*n*n*n)/3.0f;
	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));

	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		starpu_data_unregister(dataAp[bi+nblocks*bj]);
	}

	/* NOTE(review): assumes dw_codelet_facto_pivot() does not keep
	 * piv_description once it has returned - it hands back the elapsed
	 * time, so the factorisation is expected to be over. */
	free(piv_description);

	/* every handle has been unregistered above: the array is not needed
	 * any more */
	free(dataAp);
}
/*
 * Store `flag` in the sequential_consistency field of `handle`, and
 * recursively in those of its children that have children of their own.
 *
 * The header lock of `handle` is held for the whole call, recursion
 * included; the field itself is written under
 * sequential_consistency_mutex.
 *
 * NOTE(review): a child whose nchildren is 0 is not visited, so such a
 * child keeps its previous flag - confirm that this is intended.
 */
void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag)
{
	unsigned idx = 0;

	_starpu_spin_lock(&handle->header_lock);

	/* make sure that the flags are applied to the children as well */
	while (idx < handle->nchildren)
	{
		struct starpu_data_state_t *sub = &handle->children[idx];

		/* only descend into children that are themselves partitioned */
		if (sub->nchildren > 0)
		{
			starpu_data_set_sequential_consistency_flag(sub, flag);
		}

		idx++;
	}

	/* the handle itself is updated last, after its children */
	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
	handle->sequential_consistency = flag;
	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);

	_starpu_spin_unlock(&handle->header_lock);
}
static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel) { int ret; /* create a new codelet */ struct starpu_task *entry_task = NULL; /* create all the DAG nodes */ unsigned i,j,k; starpu_data_handle_t dataA; /* monitor and partition the A matrix into blocks : * one block is now determined by 2 unsigned (i,j) */ starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float)); starpu_data_set_sequential_consistency_flag(dataA, 0); struct starpu_data_filter f = { .filter_func = starpu_matrix_filter_vertical_block, .nchildren = nblocks }; struct starpu_data_filter f2 = { .filter_func = starpu_matrix_filter_block, .nchildren = nblocks }; starpu_data_map_filters(dataA, 2, &f, &f2); for (k = 0; k < nbigblocks; k++) { struct starpu_task *task = create_task_11(dataA, k, reclevel); /* we defer the launch of the first task */ if (k == 0) { entry_task = task; } else { ret = starpu_task_submit(task); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } for (j = k+1; j<nblocks; j++) { ret = create_task_21(dataA, k, j, reclevel); if (ret == -ENODEV) return 77; for (i = k+1; i<nblocks; i++) { if (i <= j) { ret = create_task_22(dataA, k, i, j, reclevel); if (ret == -ENODEV) return 77; } } } } /* schedule the codelet */ ret = starpu_task_submit(entry_task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); return 77; } if (nblocks == nbigblocks) { /* stall the application until the end of computations */ starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel)); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); return 0; } else { STARPU_ASSERT(reclevel == 0); unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks); starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t)); STARPU_ASSERT(tag_array); unsigned ind = 0; for (i = nbigblocks; i < nblocks; i++) for 
(j = nbigblocks; j < nblocks; j++) { if (i <= j) tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel); } starpu_tag_wait_array(ind, tag_array); free(tag_array); starpu_data_unpartition(dataA, STARPU_MAIN_RAM); starpu_data_unregister(dataA); float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)]; return cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1); } }