void thread_entry(int cid, int nc)
{
  coreid = cid;
  ncores = nc;

  // static allocates data in the binary, which is visible to both threads
  static data_t results_data[ARRAY_SIZE];

  // Execute the provided, naive matmul
  barrier();
  stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());

  // verify
  int res = verifyDouble(ARRAY_SIZE, results_data, verify_data);
  if (res)
    exit(res);

#if 0
  // clear results from the first trial
  size_t i;
  if (coreid == 0)
    for (i = 0; i < ARRAY_SIZE; i++)
      results_data[i] = 0;
  barrier();

  // Execute your faster matmul
  barrier();
  stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());

#ifdef DEBUG
  printArray("results:", ARRAY_SIZE, results_data);
  printArray("verify :", ARRAY_SIZE, verify_data);
#endif

  // verify
  res = verify(ARRAY_SIZE, results_data, verify_data);
  if (res)
    exit(res);
  barrier();
#endif

  exit(0);
}
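// For reference, a minimal sketch of what the naive kernel invoked above is
// assumed to look like: a triple loop over row-major, square lda x lda
// operands that accumulates into C. The `_sketch` suffix is illustrative,
// to avoid clashing with the provided matmul_naive:
static void matmul_naive_sketch(const size_t lda,
                                const data_t A[], const data_t B[],
                                data_t C[])
{
  size_t i, j, k;
  for (i = 0; i < lda; i++)
    for (j = 0; j < lda; j++)
      for (k = 0; k < lda; k++)
        C[i*lda + j] += A[i*lda + k] * B[k*lda + j];
}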
int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    parse_cmdline(argc, argv);

    // Assume for SUMMA simplicity that nprocs is a perfect square,
    // and allow only such nprocs.
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int n_proc_rows = sqrt(nprocs);
    int n_proc_cols = n_proc_rows;
    if (n_proc_cols * n_proc_rows != nprocs) {
        fprintf(stderr, "ERROR: number of processors must be a perfect square!\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // create 2D Cartesian communicator from `nprocs` procs
    int ndims = 2;
    const int dims[2] = {n_proc_rows, n_proc_cols};
    const int periods[2] = {0, 0};
    int reorder = 0;
    MPI_Comm comm_cart;
    // ======== YOUR CODE HERE ============================
    // Create a 2D Cartesian communicator using the MPI_Cart_create function.
    // MPI_COMM_WORLD is your initial communicator.
    // We do not need periodicity in dimensions for SUMMA, so we set periods to 0.
    // We also do not need to reorder ranking, so we set reorder to 0 too.
    //
    // Dimensions of the new communicator should be [n_proc_rows, n_proc_cols].
    // The new communicator with Cartesian topology should be assigned to
    // the variable `comm_cart`.
    //
    // MPI_Cart_create(... YOUR CODE HERE ...);
    // ====================================================
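    // For reference, a minimal sketch of the call the stub above asks for
    // (one possible completion, using the standard MPI_Cart_create signature;
    // shown here so the skeleton runs end to end):
    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &comm_cart);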

    // Assume for simplicity that the matrix dims are divisible by the proc grid size;
    // each proc determines its local block sizes.
    int mb = m / n_proc_rows;
    int nb = n / n_proc_cols; // == n / n_proc_rows
    int kb = k / n_proc_cols;
    if (mb * n_proc_rows != m) {
        fprintf(stderr, "ERROR: m must be divisible by n_proc_rows\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (nb * n_proc_cols != n) {
        fprintf(stderr, "ERROR: n must be divisible by n_proc_cols\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (kb * n_proc_cols != k) {
        fprintf(stderr, "ERROR: k must be divisible by n_proc_cols\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // each processor allocates memory for local portions of A, B and C
    double *A_loc = NULL;
    double *B_loc = NULL;
    double *C_loc = NULL;
    A_loc = (double *) calloc(mb * nb, sizeof(double));
    B_loc = (double *) calloc(nb * kb, sizeof(double));
    C_loc = (double *) calloc(mb * kb, sizeof(double));

#ifdef CHECK_NUMERICS
    // rank 0 allocates matrices A_glob, B_glob, C_glob, C_glob_naive for checking
    double *A_glob = NULL;
    double *B_glob = NULL;
    double *C_glob = NULL;
    double *C_glob_naive = NULL;
    if (myrank == 0) {
        A_glob = (double *) calloc(m * n, sizeof(double));
        B_glob = (double *) calloc(n * k, sizeof(double));
        C_glob = (double *) calloc(m * k, sizeof(double));
        C_glob_naive = (double *) calloc(m * k, sizeof(double));
    }
#endif

    // Init matrices: fill A_loc and B_loc with random values.
    // In real life A_loc and B_loc would be computed by each proc,
    // e.g. from partial differential equations.
    init_matrix(A_loc, mb, nb);
    init_matrix(B_loc, nb, kb);

    // gather A_glob, B_glob for further checking
#ifdef CHECK_NUMERICS
    gather_glob(mb, nb, A_loc, m, n, A_glob);
    gather_glob(nb, kb, B_loc, n, k, B_glob);
#endif

    // call SUMMA and measure execution time using tstart, tend
    double tstart, tend;
    tstart = MPI_Wtime();

    // You should implement the SUMMA algorithm in the SUMMA function.
    // The SUMMA stub function is in this file (see above).
    SUMMA(comm_cart, mb, nb, kb, A_loc, B_loc, C_loc);

    tend = MPI_Wtime();

    // Each processor will spend a different amount of time doing its portion
    // of work in the SUMMA algorithm. To understand how long the SUMMA
    // execution took overall, we should find the time of the slowest
    // processor, using the MPI_Reduce function with the MPI_MAX operation.
    double etime = tend - tstart;
    double max_etime = 0.0;
    // ======== YOUR CODE HERE ============================
    // Determine the maximum value of `etime` across all processors in MPI_COMM_WORLD
    // and save it in the `max_etime` variable on the root processor (rank 0).
    // Use the MPI_Reduce function and the MPI_MAX operation.
    // MPI_Reduce(... YOUR CODE HERE ...);
    // ====================================================

    if (myrank == 0) {
        printf("SUMMA took %f sec\n", max_etime);
    }

#ifdef CHECK_NUMERICS
    // gather C_glob
    gather_glob(mb, kb, C_loc, m, k, C_glob);

    if (myrank == 0) {
        matmul_naive(m, n, k, A_glob, B_glob, C_glob_naive);

#ifdef DEBUG
        printf("C_glob_naive:\n");
        print_matrix(m, k, C_glob_naive);
        printf("C_glob:\n");
        print_matrix(m, k, C_glob);
#endif

        // C is m x k, so validate over m rows and k columns
        double eps = validate(m, k, C_glob, C_glob_naive);
        if (eps > TOL) {
            fprintf(stderr, "ERROR: eps = %f\n", eps);
            MPI_Abort(MPI_COMM_WORLD, 1);
        } else {
            printf("SUMMA: OK: eps = %f\n", eps);
        }
    }

    free(A_glob);
    free(B_glob);
    free(C_glob);
    free(C_glob_naive);
#endif

    // deallocate matrices
    free(A_loc);
    free(B_loc);
    free(C_loc);

    MPI_Finalize();
    return 0;
}
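/*
 * For reference, a minimal sketch of the reduction the stub in main() asks
 * for, wrapped in a helper so it compiles at file scope. The helper name is
 * illustrative, not part of the assignment:
 */
static double max_elapsed_time(double etime)
{
    double max_etime = 0.0;
    // Rank 0 receives the maximum of `etime` over all ranks in MPI_COMM_WORLD;
    // on the other ranks max_etime keeps its initial value of 0.0.
    MPI_Reduce(&etime, &max_etime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    return max_etime;
}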