/* Gather the full distributed matrix into one freshly-allocated size x size
 * buffer (column-major, as elsewhere in this file).
 *
 * Collective call: every MPI rank must enter it.  Blocks owned by rank 0 are
 * copied directly; every other block is sent by its owner to rank 0 with a
 * plain send/recv pair.  Only rank 0 fills in the returned buffer — on other
 * ranks the returned buffer stays zeroed (calloc).  Caller owns the returned
 * memory and must free() it on every rank. */
TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
{
	// fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);

	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));

	unsigned block_size = size/nblocks;

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	unsigned bi, bj;
	for (bj = 0; bj < nblocks; bj++)
	for (bi = 0; bi < nblocks; bi++)
	{
		/* NOTE: block stays uninitialized on ranks that are neither 0
		 * nor the owner — harmless, since only rank 0 dereferences it
		 * below. */
		TYPE *block;

		int block_rank = get_block_rank(bi, bj);

		if (block_rank == 0)
		{
			/* Rank 0 already owns this block: no transfer needed. */
			block = STARPU_PLU(get_block)(bi, bj);
		}
		else
		{
			MPI_Status status;

			if (rank == 0)
			{
				/* Receive the remote block into a temporary
				 * buffer (freed after the copy below). */
				block = calloc(block_size*block_size, sizeof(TYPE));

				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
				STARPU_ASSERT(ret == MPI_SUCCESS);
			}
			else if (rank == block_rank)
			{
				/* We own this block: ship it to rank 0. */
				block = STARPU_PLU(get_block)(bi, bj);
				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
				STARPU_ASSERT(ret == MPI_SUCCESS);
			}
		}

		if (rank == 0)
		{
			/* Scatter the block into its place in the big
			 * column-major matrix. */
			unsigned j, i;
			for (j = 0; j < block_size; j++)
			for (i = 0; i < block_size; i++)
			{
				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
					block[j+i*block_size];
			}

			/* Only the received copies were allocated here;
			 * locally-owned blocks must not be freed. */
			if (get_block_rank(bi, bj) != 0)
				free(block);
		}
	}

	return bigmatrix;
}
/* Accumulate sub_y += U * sub_x where U is the upper triangle (including the
 * diagonal) of the diagonal block block_data.
 *
 * Works on a zero-initialized copy so that the strictly-lower entries of the
 * block do not contribute.
 *
 * Fix: the original declaration had no return type ("static STARPU_PLU(...)"),
 * relying on implicit int — invalid since C99 and wrong anyway, since the
 * function returns nothing. */
static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
				TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
{
	unsigned block_size = size/nblocks;

	/* Take a copy of the upper part of the diagonal block */
	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
	STARPU_ASSERT(upper_block_copy);

	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);

	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);

	free(upper_block_copy);
}
/* Compute y = A*x in parallel, A being distributed by blocks.
 * x and y must be valid (at least) on 0; every rank must enter this
 * collective call. */
void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
{
	unsigned block_size = size/nblocks;

	/* Send x to everyone.
	 * Fix: the original passed &x, broadcasting `size` elements from the
	 * address of the local pointer variable — a stack overread on the
	 * root and stack corruption on the other ranks.  The buffer to
	 * broadcast is the vector itself. */
	int bcst_ret;
	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);

	/* Create temporary buffers where all MPI processes are going to
	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
	 * affected to process i, and 0 everywhere else. We then have y as the
	 * sum of all yi. */
	TYPE *yi = calloc(size, sizeof(TYPE));
	STARPU_ASSERT(yi);

	/* Compute Aix = yi */
	unsigned long i,j;
	for (j = 0; j < nblocks; j++)
	{
		for (i = 0; i < nblocks; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*block_size];
				TYPE *sub_yi = &yi[j*block_size];

				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
			}
		}
	}

	/* Compute the Sum of all yi = y */
	int reduce_ret = MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	fprintf(stderr, "RANK %d - FOO 1 y[0] %f\n", rank, y[0]);

	free(yi);
}
static void display_grid(int rank, unsigned pnblocks) { if (!display) return; //if (rank == 0) { fprintf(stderr, "2D grid layout (Rank %d): \n", rank); unsigned i, j; for (j = 0; j < pnblocks; j++) { for (i = 0; i < pnblocks; i++) { TYPE *blockptr = STARPU_PLU(get_block)(i, j); starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j); fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle); } fprintf(stderr, "\n"); } } }
/* Dump a blocksize x blocksize block to stderr, one matrix row per output
 * line (data is stored column-major).  No-op unless display is enabled. */
void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
{
	if (!STARPU_PLU(display_flag)())
		return;

	fprintf(stderr, "DISPLAY BLOCK\n");

	unsigned row, col;
	for (row = 0; row < blocksize; row++)
	{
		for (col = 0; col < blocksize; col++)
			fprintf(stderr, "%f ", data[row + col*blocksize]);

		fprintf(stderr, "\n");
	}

	fprintf(stderr, "****\n");
}
/* Check the factorization: reconstruct the factored matrix (L and U packed in
 * place), rebuild L*U on rank 0 and compare it against the saved original
 * Asaved, reporting average/max error and the scaled Frobenius residual.
 *
 * Collective call.  Fixes: L, U and the reconstructed copy all_r (allocated
 * on every rank by reconstruct_matrix) were leaked. */
void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
{
	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

	unsigned display = STARPU_PLU(display_flag)();

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	if (rank == 0)
	{
		TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
		TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
		STARPU_ASSERT(L && U);

		memset(L, 0, size*size*sizeof(TYPE));
		memset(U, 0, size*size*sizeof(TYPE));

		/* only keep the lower part: split the packed factors of all_r
		 * into L (lower + diagonal) and U (strict upper, unit diagonal) */
		unsigned i, j;
		for (j = 0; j < size; j++)
		{
			for (i = 0; i < j; i++)
			{
				L[j+i*size] = all_r[j+i*size];
			}

			/* diag i = j */
			L[j+j*size] = all_r[j+j*size];
			U[j+j*size] = 1.0;

			for (i = j+1; i < size; i++)
			{
				U[j+i*size] = all_r[j+i*size];
			}
		}

		STARPU_PLU(display_data_content)(L, size);
		STARPU_PLU(display_data_content)(U, size);

		/* now A_err = L, compute L*U */
		CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);

		if (display)
			fprintf(stderr, "\nLU\n");

		STARPU_PLU(display_data_content)(L, size);

		/* compute "LU - A" in L*/
		CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);

		TYPE err = CPU_ASUM(size*size, L, 1);
		int max = CPU_IAMAX(size*size, L, 1);

		if (display)
			fprintf(stderr, "DISPLAY ERROR\n");

		STARPU_PLU(display_data_content)(L, size);

		fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);

		double residual = frobenius_norm(L, size);
		double matnorm = frobenius_norm(Asaved, size);

		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));

		/* fix leak: L and U were never released */
		free(L);
		free(U);
	}

	/* fix leak: reconstruct_matrix allocates on every rank */
	free(all_r);
}
/* Compute y = L*(U*x) with the distributed packed factors, for checking the
 * factorization against the original matrix.  Collective call; x and y must
 * be valid (at least) on rank 0.  NOTE: x is clobbered — it holds U*x on
 * exit. */
void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
{
	/* Create temporary buffers where all MPI processes are going to
	 * compute Ui x = yi where Ai is the matrix containing the blocks of U
	 * affected to process i, and 0 everywhere else. We then have y as the
	 * sum of all yi. */
	TYPE *yi = calloc(size, sizeof(TYPE));
	STARPU_ASSERT(yi);

	fprintf(stderr, "Compute LU\n");

	unsigned block_size = size/nblocks;

	/* Compute UiX = Yi */
	unsigned long i,j;
	for (j = 0; j < nblocks; j++)
	{
		if (get_block_rank(j, j) == rank)
		{
			/* Diagonal block: only its upper triangle belongs to U */
			TYPE *block_data = STARPU_PLU(get_block)(j, j);
			TYPE *sub_x = &x[j*block_size];
			TYPE *sub_yi = &yi[j*block_size];

			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
		}

		for (i = j + 1; i < nblocks; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*block_size];
				TYPE *sub_yi = &yi[j*block_size];

				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
			}
		}
	}

	/* Grab Sum Yi in X */
	MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
	memset(yi, 0, size*sizeof(TYPE));

	/* Everyone needs x.
	 * Fix: the original broadcast &x — `size` elements read from the
	 * address of the local pointer variable instead of the vector. */
	int bcst_ret;
	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);

	/* Compute LiX = Yi (with X = UX) */
	for (j = 0; j < nblocks; j++)
	{
		/* Strictly-lower blocks of column j (empty when j == 0) */
		for (i = 0; i < j; i++)
		{
			if (get_block_rank(i, j) == rank)
			{
				/* That block belongs to the current MPI process */
				TYPE *block_data = STARPU_PLU(get_block)(i, j);
				TYPE *sub_x = &x[i*block_size];
				TYPE *sub_yi = &yi[j*block_size];

				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
			}
		}

		if (get_block_rank(j, j) == rank)
		{
			/* Diagonal block: L has its lower triangle, unit diagonal */
			TYPE *block_data = STARPU_PLU(get_block)(j, j);
			TYPE *sub_x = &x[j*block_size];
			TYPE *sub_yi = &yi[j*block_size];

			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
		}
	}

	/* Grab Sum Yi in Y */
	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);

	free(yi);
}
/* Entry point: initialize MPI + StarPU, build the distributed matrix, run the
 * parallel LU factorization, report timings on rank 0, and (with -check)
 * verify ||A - LU||.  Shutdown order: cuBLAS, StarPU-MPI, StarPU. */
int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
	{
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	/* Funneled support may still work since only one thread should issue
	 * MPI calls at a time here — warn rather than abort. */
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	/* The p x q process grid must match the communicator size. */
	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 * 	Problem Init
	 */
	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
			(int)(allocated_memory/(1024*1024)),
			(int)(allocated_memory_extra/(1024*1024)),
			(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
	// STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		/* x: random right-hand side (filled on rank 0 only);
		 * y: result buffer; a_r: copy of A before factorization. */
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

		// STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/* Run the factorization proper; returns the local elapsed time in us. */
	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 *	Report performance
	 */
	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	if (rank == 0)
	{
		/* The slowest rank bounds the overall computation time. */
		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));

		/* LU flop count: 2n^3/3 */
		unsigned n = size;
		double flop = (2.0f*n*n*n)/3.0f;
		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
	}

	/*
	 *	Test Result Correctness
	 */
	if (check)
	{
		/*
		 *	Compute || A - LU ||
		 */
		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);

#if 0
		/*
		 *	Compute || Ax - LUx ||
		 */
		unsigned ind;

		y2 = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			for (ind = 0; ind < size; ind++)
			{
				y2[ind] = (TYPE)0.0;
			}
		}

		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);

		/* Compute y2 = y2 - y */
		CPU_AXPY(size, -1.0, y, 1, y2, 1);

		TYPE err = CPU_ASUM(size, y2, 1);
		int max = CPU_IAMAX(size, y2, 1);

		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
#endif
	}

	/*
	 *	Termination
	 */
	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	starpu_cublas_shutdown();
	starpu_mpi_shutdown();
	starpu_shutdown();

	/* NOTE(review): MPI_Finalize is disabled — presumably
	 * starpu_mpi_shutdown() finalizes MPI itself; confirm before
	 * re-enabling. */
#if 0
	MPI_Finalize();
#endif

	return 0;
}