TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks) { // fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks); TYPE *bigmatrix = calloc(size*size, sizeof(TYPE)); unsigned block_size = size/nblocks; int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); unsigned bi, bj; for (bj = 0; bj < nblocks; bj++) for (bi = 0; bi < nblocks; bi++) { TYPE *block; int block_rank = get_block_rank(bi, bj); if (block_rank == 0) { block = STARPU_PLU(get_block)(bi, bj); } else { MPI_Status status; if (rank == 0) { block = calloc(block_size*block_size, sizeof(TYPE)); int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status); STARPU_ASSERT(ret == MPI_SUCCESS); } else if (rank == block_rank) { block = STARPU_PLU(get_block)(bi, bj); int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD); STARPU_ASSERT(ret == MPI_SUCCESS); } } if (rank == 0) { unsigned j, i; for (j = 0; j < block_size; j++) for (i = 0; i < block_size; i++) { bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] = block[j+i*block_size]; } if (get_block_rank(bi, bj) != 0) free(block); } } return bigmatrix; }
/*
 * Tell whether `rank` is assigned at least one block (i, col) with
 * 1 <= col < pnblocks, according to get_block_rank().
 * Index 0 is not examined. Returns 1 if such a block exists, 0 otherwise.
 */
static unsigned tmp_21_block_is_needed(int rank, unsigned pnblocks, unsigned i)
{
	unsigned col = 1;

	while (col < pnblocks)
	{
		if (get_block_rank(i, col) == rank)
		{
			/* One match is enough */
			return 1;
		}
		col++;
	}

	return 0;
}
/*
 * Tell whether `rank` is assigned at least one block (row, j) with
 * 1 <= row < pnblocks, according to get_block_rank().
 * Index 0 is not examined. Returns 1 if such a block exists, 0 otherwise.
 */
static unsigned tmp_12_block_is_needed(int rank, unsigned pnblocks, unsigned j)
{
	unsigned row = 1;

	while (row < pnblocks)
	{
		if (get_block_rank(row, j) == rank)
		{
			/* One match is enough */
			return 1;
		}
		row++;
	}

	return 0;
}
/*
 * Compute y = A x for the block-distributed matrix A.
 *
 * Collective over MPI_COMM_WORLD: every rank must call it.
 *
 * size    : number of elements of x and y
 * x       : input vector. Its contents only need to be meaningful on rank 0,
 *           but it must point to `size` writable elements on every rank
 *           since the vector is broadcast from rank 0 into it.
 * y       : output vector, receives the reduced result on rank 0. It is
 *           also read (y[0]) by the trace below on every rank, so it has to
 *           be dereferenceable everywhere.
 * nblocks : number of blocks per dimension
 * rank    : rank of the calling process; selects the blocks it contributes
 */
void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
{
	unsigned block_size = size/nblocks;

	/* Send x to everyone. The buffer is x itself: passing &x would
	 * broadcast `size` elements starting at the local pointer variable
	 * and trash the stack instead of transferring the vector. */
	int bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);

	/* Create temporary buffers where all MPI processes are going to
	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
	 * assigned to process i, and 0 everywhere else. We then have y as the
	 * sum of all yi. */
	TYPE *yi = calloc(size, sizeof(TYPE));
	/* calloc() may legitimately return NULL for an empty request */
	STARPU_ASSERT(yi || size == 0);

	/* Compute Ai x = yi */
	unsigned long i, j;
	for (j = 0; j < nblocks; j++)
	{
		for (i = 0; i < nblocks; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*block_size];
				TYPE *sub_yi = &yi[j*block_size];

				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
			}
		}
	}

	/* Compute the sum of all yi = y (only significant on rank 0) */
	int reduce_ret = MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	fprintf(stderr, "RANK %d - FOO 1 y[0] %f\n", rank, y[0]);

	free(yi);
}
static void display_grid(int rank, unsigned pnblocks) { if (!display) return; //if (rank == 0) { fprintf(stderr, "2D grid layout (Rank %d): \n", rank); unsigned i, j; for (j = 0; j < pnblocks; j++) { for (i = 0; i < pnblocks; i++) { TYPE *blockptr = STARPU_PLU(get_block)(i, j); starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j); fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle); } fprintf(stderr, "\n"); } } }
/*
 * Compute y = L (U x) from the block-distributed LU factors.
 *
 * Collective over MPI_COMM_WORLD: every rank must call it.
 * The work is done in two passes, each ending with a sum-reduction on
 * rank 0:
 *   1. x <- U x  (diagonal blocks through compute_ax_block_upper, blocks
 *      (i, j) with i > j through compute_ax_block), then x is broadcast
 *      again from rank 0;
 *   2. y <- L x  (blocks (i, j) with i < j through compute_ax_block,
 *      diagonal blocks through compute_ax_block_lower).
 *
 * size    : number of elements of x and y
 * x       : input vector; read on every rank during pass 1, then
 *           overwritten (reduce on rank 0, broadcast everywhere), so it must
 *           point to `size` writable elements on every rank
 * y       : output vector, receives the reduced result on rank 0
 * nblocks : number of blocks per dimension
 * rank    : rank of the calling process; selects the blocks it contributes
 */
void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
{
	/* Create temporary buffers where all MPI processes are going to
	 * compute Ui x = yi where Ui is the matrix containing the blocks of U
	 * assigned to process i, and 0 everywhere else. We then have y as the
	 * sum of all yi. */
	TYPE *yi = calloc(size, sizeof(TYPE));
	/* calloc() may legitimately return NULL for an empty request */
	STARPU_ASSERT(yi || size == 0);

	fprintf(stderr, "Compute LU\n");

	unsigned block_size = size/nblocks;

	/* Compute Ui x = yi */
	unsigned long i, j;
	for (j = 0; j < nblocks; j++)
	{
		if (get_block_rank(j, j) == rank)
		{
			TYPE *block_data = STARPU_PLU(get_block)(j, j);
			TYPE *sub_x = &x[j*(block_size)];
			TYPE *sub_yi = &yi[j*(block_size)];

			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
		}

		for (i = j + 1; i < nblocks; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*(block_size)];
				TYPE *sub_yi = &yi[j*(block_size)];

				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
			}
		}
	}

	/* Grab the sum of all yi in x, then reset yi for the second pass */
	int reduce_ret = MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
	memset(yi, 0, size*sizeof(TYPE));

	/* Everyone needs x. The buffer is x itself: passing &x would
	 * broadcast `size` elements starting at the local pointer variable
	 * and trash the stack instead of transferring the vector. */
	int bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);

	/* Compute Li x = yi (with x = U x) */
	for (j = 0; j < nblocks; j++)
	{
		/* Off-diagonal blocks first, the diagonal one last */
		for (i = 0; i < j; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*(block_size)];
				TYPE *sub_yi = &yi[j*(block_size)];

				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
			}
		}

		if (get_block_rank(j, j) == rank)
		{
			TYPE *block_data = STARPU_PLU(get_block)(j, j);
			TYPE *sub_x = &x[j*(block_size)];
			TYPE *sub_yi = &yi[j*(block_size)];

			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
		}
	}

	/* Grab the sum of all yi in y (only significant on rank 0) */
	reduce_ret = MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	free(yi);
}
static void init_matrix(int rank) { #ifdef STARPU_HAVE_LIBNUMA if (numa) { fprintf(stderr, "Using INTERLEAVE policy\n"); unsigned long nodemask = ((1<<0)|(1<<1)); int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3); if (ret) perror("set_mempolicy failed"); } #endif /* Allocate a grid of data handles, not all of them have to be allocated later on */ dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t)); dataA = calloc(nblocks*nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE); /* Allocate all the blocks that belong to this mpi node */ unsigned long i,j; for (j = 0; j < nblocks; j++) { for (i = 0; i < nblocks; i++) { TYPE **blockptr = &dataA[j+i*nblocks]; // starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; if (get_block_rank(i, j) == rank) { /* This blocks should be treated by the current MPI process */ /* Allocate and fill it */ starpu_malloc((void **)blockptr, blocksize); allocated_memory += blocksize; //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); fill_block_with_random(*blockptr, size, nblocks); //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); if (i == j) { unsigned tmp; for (tmp = 0; tmp < size/nblocks; tmp++) { (*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks; } } /* Register it to StarPU */ starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*blockptr, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } else { *blockptr = STARPU_POISON_PTR; *handleptr = STARPU_POISON_PTR; } } } /* Allocate the temporary buffers required for the distributed algorithm */ unsigned k; /* tmp buffer 11 */ #ifdef SINGLE_TMP11 starpu_malloc((void **)&tmp_11_block, blocksize); allocated_memory_extra += blocksize; starpu_matrix_data_register(&tmp_11_block_handle, 
STARPU_MAIN_RAM, (uintptr_t)tmp_11_block, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); #else tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_11_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); for (k = 0; k < nblocks; k++) { if (tmp_11_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_11_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_11_block[k]); starpu_matrix_data_register(&tmp_11_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_11_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif /* tmp buffers 12 and 21 */ #ifdef SINGLE_TMP1221 tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block = calloc(nblocks, sizeof(TYPE *)); tmp_21_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); #else for (i = 0; i < 2; i++) { tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *)); tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); } #endif for (k = 0; k < nblocks; k++) { #ifdef SINGLE_TMP1221 if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[k]); starpu_matrix_data_register(&tmp_12_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[k]); 
starpu_matrix_data_register(&tmp_21_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } #else for (i = 0; i < 2; i++) { if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[i][k]); starpu_matrix_data_register(&tmp_12_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[i][k]); starpu_matrix_data_register(&tmp_21_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif } //display_all_blocks(nblocks, size/nblocks); }