Example 1
void thread_entry(int cid, int nc)
{
   coreid = cid;
   ncores = nc;

   // static allocates data in the binary, which is visible to both threads
   static data_t results_data[ARRAY_SIZE];


   // Execute the provided, naive matmul
   barrier();
   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
 
   
   // verify
   int res = verifyDouble(ARRAY_SIZE, results_data, verify_data);
   if (res)
      exit(res);

#if 0
   // clear results from the first trial
   size_t i;
   if (coreid == 0) 
      for (i=0; i < ARRAY_SIZE; i++)
         results_data[i] = 0;
   barrier();

   
   // Execute your faster matmul
   barrier();
   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
 
#ifdef DEBUG
   printArray("results:", ARRAY_SIZE, results_data);
   printArray("verify :", ARRAY_SIZE, verify_data);
#endif
   
   // verify
   res = verify(ARRAY_SIZE, results_data, verify_data);
   if (res)
      exit(res);
   barrier();
#endif

   exit(0);
}
Example 2
int main(int argc, char *argv[]) {

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    parse_cmdline(argc, argv);

    // for SUMMA, assume for simplicity that nprocs is a perfect square
    // and allow only such process counts
    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int n_proc_rows = (int) sqrt(nprocs);
    int n_proc_cols = n_proc_rows;
    if (n_proc_cols * n_proc_rows != nprocs) {
        fprintf(stderr, "ERROR: number of proccessors must be a perfect square!\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // create 2D cartesian communicator from `nprocs` procs
    int ndims = 2;
    const int dims[2] = {n_proc_rows, n_proc_cols};
    const int periods[2] = {0, 0};
    int reorder = 0;
    MPI_Comm comm_cart;
    // ======== YOUR CODE HERE ============================
    // Create 2D cartesian communicator using MPI_Cart_Create function
    // MPI_COMM_WORLD is your initial communicator
    // We do not need periodicity in dimensions for SUMMA, so we set periods to 0.
    // We also do not need to reorder ranking, so we set reorder to 0 too.
    //
    // Dimensions of the new communicator should be [n_proc_rows, n_proc_cols].
    // New communicator with Cartesian topology should be assigned to
    // variable `comm_cart`.
    //
    // MPI_Cart_create(... YOUR CODE HERE ...);
    // ====================================================
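    // A possible completion (hedged sketch, not the assignment's reference
    // solution): standard MPI_Cart_create call using the variables above.
    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &comm_cart);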


    // assume for simplicity that the matrix dims are divisible by the proc grid size
    // each proc determines its local block sizes
    int mb = m / n_proc_rows;
    int nb = n / n_proc_cols; // == n / n_proc_rows
    int kb = k / n_proc_cols;
    if (mb * n_proc_rows != m) {
        fprintf(stderr, "ERROR: m must be dividable by n_proc_rows\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (nb * n_proc_cols != n) {
        fprintf(stderr, "ERROR: n must be dividable by n_proc_cols\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (kb * n_proc_cols != k) {
        fprintf(stderr, "ERROR: k must be dividable by n_proc_cols\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // each processor allocates memory for local portions of A, B and C
    double *A_loc = NULL;
    double *B_loc = NULL;
    double *C_loc = NULL;
    A_loc = (double *) calloc(mb * nb, sizeof(double));
    B_loc = (double *) calloc(nb * kb, sizeof(double));
    C_loc = (double *) calloc(mb * kb, sizeof(double));

#ifdef CHECK_NUMERICS
    // rank 0 allocates matrices A_glob, B_glob, C_glob, C_glob_naive for checking
    double *A_glob = NULL;
    double *B_glob = NULL;
    double *C_glob = NULL;
    double *C_glob_naive = NULL;
    if (myrank == 0) {
        A_glob = (double *) calloc(m * n, sizeof(double));
        B_glob = (double *) calloc(n * k, sizeof(double));
        C_glob = (double *) calloc(m * k, sizeof(double));
        C_glob_naive = (double *) calloc(m * k, sizeof(double));
    }
#endif

    // init matrices: fill A_loc and B_loc with random values
    // (in a real application, A_loc and B_loc would be computed by each proc,
    // e.g. from partial differential equations)
    init_matrix(A_loc, mb, nb);
    init_matrix(B_loc, nb, kb);

    // gather A_glob, B_glob for further checking
#ifdef CHECK_NUMERICS
    gather_glob(mb, nb, A_loc, m, n, A_glob);
    gather_glob(nb, kb, B_loc, n, k, B_glob);
#endif

    // call SUMMA and measure execution time using tstart, tend
    double tstart, tend;
    tstart = MPI_Wtime();

    // You should implement the SUMMA algorithm in the SUMMA function.
    // The SUMMA stub function is defined earlier in this file.
    // (A hedged sketch of one possible implementation follows this listing.)
    SUMMA(comm_cart, mb, nb, kb, A_loc, B_loc, C_loc);

    tend = MPI_Wtime();

    // Each processor spends a different amount of time on its portion of
    // the SUMMA algorithm. To see how long the overall SUMMA execution took,
    // we need the time of the slowest processor, which we obtain with
    // MPI_Reduce and the MPI_MAX operation.
    double etime = tend - tstart;
    double max_etime = 0.0;
    
    // ======== YOUR CODE HERE ============================
    // Determine maximum value of `etime` across all processors in MPI_COMM_WORLD
    // and save it in `max_etime` variable on root processor (rank 0).
    // Use MPI_Reduce function and MPI_MAX operation.
    // MPI_Reduce(... YOUR CODE HERE ...);
    // ====================================================
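    // A possible completion (hedged sketch): reduce the per-process elapsed
    // time with MPI_MAX so that rank 0 receives the slowest processor's time.
    MPI_Reduce(&etime, &max_etime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);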
    if (myrank == 0) {
        printf("SUMMA took %f sec\n", max_etime);
    }
    
#ifdef CHECK_NUMERICS    
    // gather C_glob
    gather_glob(mb, kb, C_loc, m, k, C_glob);

    if (myrank == 0) {
        matmul_naive(m, n, k, A_glob, B_glob, C_glob_naive);

#ifdef DEBUG
        printf("C_glob_naive:\n");
        print_matrix(m, k, C_glob_naive);
        printf("C_glob:\n");
        print_matrix(m, k, C_glob);
#endif
        
        double eps = validate(m, k, C_glob, C_glob_naive);
        if (eps > TOL) {
            fprintf(stderr, "ERROR: eps = %f\n", eps);
            MPI_Abort(MPI_COMM_WORLD, 1);
        } else {
            printf("SUMMA: OK: eps = %f\n", eps);
        }
    }

    free(A_glob);
    free(B_glob);
    free(C_glob);
    free(C_glob_naive);
#endif

    // deallocate matrices
    free(A_loc);
    free(B_loc);
    free(C_loc);

    MPI_Finalize();
    return 0;
}
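
The SUMMA function called in Example 2 is only referenced as a stub defined earlier in that file. The sketch below shows one possible implementation consistent with the call SUMMA(comm_cart, mb, nb, kb, A_loc, B_loc, C_loc) above; it assumes a square process grid, row-major storage of the local blocks, and that mpi.h, stdlib.h, and string.h are included. The names SUMMA_sketch, A_panel, and B_panel are illustrative and not part of the original assignment.

// A hedged SUMMA sketch (illustrative, not the assignment's reference solution).
// At step s, the A block owned by grid column s is broadcast along each process
// row, the B block owned by grid row s is broadcast along each process column,
// and the local product is accumulated into C_loc.
void SUMMA_sketch(MPI_Comm comm_cart, int mb, int nb, int kb,
                  double *A_loc, double *B_loc, double *C_loc)
{
    int rank, coords[2];
    MPI_Comm_rank(comm_cart, &rank);
    MPI_Cart_coords(comm_cart, rank, 2, coords);
    int my_row = coords[0];
    int my_col = coords[1];

    // split the Cartesian grid into row and column communicators;
    // within row_comm the rank equals the column coordinate, and vice versa
    MPI_Comm row_comm, col_comm;
    int keep_cols[2] = {0, 1};
    int keep_rows[2] = {1, 0};
    MPI_Cart_sub(comm_cart, keep_cols, &row_comm);
    MPI_Cart_sub(comm_cart, keep_rows, &col_comm);

    int n_steps;
    MPI_Comm_size(row_comm, &n_steps); // == n_proc_cols == n_proc_rows

    double *A_panel = (double *) malloc(mb * nb * sizeof(double));
    double *B_panel = (double *) malloc(nb * kb * sizeof(double));

    for (int s = 0; s < n_steps; s++) {
        // the process in grid column s contributes its A block at this step
        if (my_col == s)
            memcpy(A_panel, A_loc, mb * nb * sizeof(double));
        MPI_Bcast(A_panel, mb * nb, MPI_DOUBLE, s, row_comm);

        // the process in grid row s contributes its B block at this step
        if (my_row == s)
            memcpy(B_panel, B_loc, nb * kb * sizeof(double));
        MPI_Bcast(B_panel, nb * kb, MPI_DOUBLE, s, col_comm);

        // C_loc (mb x kb) += A_panel (mb x nb) * B_panel (nb x kb), row-major
        for (int i = 0; i < mb; i++)
            for (int j = 0; j < kb; j++)
                for (int l = 0; l < nb; l++)
                    C_loc[i * kb + j] += A_panel[i * nb + l] * B_panel[l * kb + j];
    }

    free(A_panel);
    free(B_panel);
    MPI_Comm_free(&row_comm);
    MPI_Comm_free(&col_comm);
}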