Example #1
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int i,j, num_procs, my_rank, ptr_a=0, ptr_b=0;
    int my_2drank, up_rank, down_rank, left_rank, right_rank, shift_source, shift_dest;
    int procs_dim[2], my_coords[2], periods[2], start[2]= {0,0};
    int *displs_a = NULL, *displs_b = NULL, *sendCounts = NULL, local_size_a[2][2], local_size_b[2][2];
    int alloc_row_a, alloc_col_a, alloc_row_b, alloc_col_b;
    int global_size_a[2],global_size_b[2], local_size_c[2];
    double **a_buffers[2], **b_buffers[2], **a_global, **b_global, **c_global, **c_local;

    double *matrix_a_ptr=NULL, *matrix_b_ptr=NULL, *matrix_c_ptr=NULL;

    MPI_Status status;
    MPI_Comm comm_2d;
    MPI_Request reqs[4];

    MPI_Init (&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    //Assume a square process grid: the same number of processes in each dimension
    procs_dim[0] = procs_dim[1] = (int)sqrt(num_procs);

    //Enable wraparound (periodic boundaries), i.e. periods different from 0
    periods[0] = periods[1] = 1;

    /*Create the 2D Cartesian topology; reorder is set to true, so processes may get a new rank in the 2D communicator*/
    MPI_Cart_create(MPI_COMM_WORLD, 2, procs_dim, periods, 1, &comm_2d);

    MPI_Comm_rank(comm_2d, &my_2drank);
    MPI_Cart_coords(comm_2d, my_2drank, 2, my_coords);

    //Get the ranks of the neighbouring processes to the left/right and above/below
    MPI_Cart_shift(comm_2d, 1, -1, &right_rank, &left_rank);
    MPI_Cart_shift(comm_2d, 0, -1, &down_rank, &up_rank);


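    //Root reads the two input matrices from the binary files given on the command line.
    //Since reorder was enabled, this assumes rank 0 of MPI_COMM_WORLD is also rank 0 of comm_2d.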
    if (my_rank == 0) {
        read_matrix_binaryformat((char *)argv[1], &a_global, &global_size_a[0], &global_size_a[1]);
        read_matrix_binaryformat((char *)argv[2], &b_global, &global_size_b[0], &global_size_b[1]);
    }

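    //Broadcast the global matrix dimensions so every rank can compute its local block sizes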
    MPI_Bcast (&global_size_a[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_a[1], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_b[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_b[1], 1, MPI_INT, 0, comm_2d);

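    /* Block decomposition: each process gets global_size/procs_dim rows and columns;
       the processes in the last row/column of the grid also take the remainder. */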
    local_size_a[1][0] = local_size_c[0] = (my_coords[0] < procs_dim[0]-1) ? global_size_a[0]/procs_dim[0]
                                           : global_size_a[0]/procs_dim[0] + global_size_a[0]%procs_dim[0];
    local_size_a[1][1] = (my_coords[1] < procs_dim[1]-1) ? global_size_a[1]/procs_dim[1]
                                           : global_size_a[1]/procs_dim[1] + global_size_a[1]%procs_dim[1];
    local_size_b[1][1] = local_size_c[1] = (my_coords[1] < procs_dim[1]-1) ? global_size_b[1]/procs_dim[1]
                                           : global_size_b[1]/procs_dim[1] + global_size_b[1]%procs_dim[1];
    local_size_b[1][0] = (my_coords[0] < procs_dim[0]-1) ? global_size_b[0]/procs_dim[0]
                                           : global_size_b[0]/procs_dim[0] + global_size_b[0]%procs_dim[0];
    //printf("rank : %d i : %d j : %d\n",my_2drank,my_coords[0],my_coords[1]);
    //printf("rank : %d c0 : %d c1 : %d\n",my_2drank, local_size_c[0], local_size_c[1]);
    //printf("rank : %d a00 : %d a01 : %d b10 : %d b11 : %d c0 : %d c1 : %d\n",my_2drank, local_size_a[1][0], local_size_a[1][1],local_size_b[1][0], local_size_b[1][1], local_size_c[0], local_size_c[1]);
    allocate_matrix(&a_buffers[1], local_size_a[1][0], local_size_a[1][1]);
    allocate_matrix(&b_buffers[1], local_size_b[1][0], local_size_b[1][1]);
    allocate_matrix(&c_local, local_size_c[0], local_size_c[1]);

    if (my_rank == 0) {
        //These pointers are non-NULL only on the root process
        matrix_a_ptr = &(a_global[0][0]);
        matrix_b_ptr = &(b_global[0][0]);

        sendCounts = (int *)malloc(num_procs*sizeof(int));
        displs_a = (int *)malloc(num_procs*sizeof(int));
        displs_b = (int *)malloc(num_procs*sizeof(int));
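        /* One count/displacement entry per process for the block distribution.
           The displacement unit depends on the derived datatype used inside
           distribute_matrix(), which is not shown here. */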
        for(i = 0; i < procs_dim[0]; i++) {
            for(j = 0; j < procs_dim[1]; j++) {
                sendCounts[i*procs_dim[0]+j] = 1;
                displs_a[i*procs_dim[0]+j] = ptr_a;
                displs_b[i*procs_dim[0]+j] = ptr_b;
                ptr_a += 1;
                ptr_b += 1;
            }
            ptr_a += procs_dim[1]*(global_size_a[0]/procs_dim[0] - 1);
            ptr_b += procs_dim[1]*(global_size_b[0]/procs_dim[1] - 1);
        }
    }


    distribute_matrix(a_buffers[1], matrix_a_ptr, sendCounts, displs_a, global_size_a, local_size_a[1]);
    distribute_matrix(b_buffers[1], matrix_b_ptr, sendCounts, displs_b, global_size_b, local_size_b[1]);

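    /* Initial alignment for Cannon's algorithm: the A block in process row i is shifted
       i positions to the left, and the B block in process column j is shifted j positions up. */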
    MPI_Cart_shift(comm_2d, 1, -my_coords[0], &shift_source, &shift_dest);

    MPI_Sendrecv(local_size_a[1], 2, MPI_INT, shift_dest, 1, local_size_a[0], 2, MPI_INT, shift_source, 1, comm_2d, &status);
    allocate_matrix(&a_buffers[0], local_size_a[0][0], local_size_a[0][1]);
    MPI_Sendrecv(&(a_buffers[1][0][0]), local_size_a[1][0]*local_size_a[1][1], MPI_DOUBLE,  shift_dest, 1, &(a_buffers[0][0][0]),
                 local_size_a[0][0]*local_size_a[0][1], MPI_DOUBLE, shift_source, 1, comm_2d, &status);

    MPI_Cart_shift(comm_2d, 0, -my_coords[1], &shift_source, &shift_dest);
    printf("rank : %d ss : %d sd : %d\n",my_2drank, shift_source, shift_dest);
    MPI_Sendrecv(local_size_b[1], 2, MPI_INT, shift_dest, 1, local_size_b[0], 2, MPI_INT, shift_source, 1, comm_2d, &status);
    allocate_matrix(&b_buffers[0], local_size_b[0][0], local_size_b[0][1]);
    MPI_Sendrecv(&(b_buffers[1][0][0]), local_size_b[1][0]*local_size_b[1][1], MPI_DOUBLE,  shift_dest, 1, &(b_buffers[0][0][0]),
                 local_size_b[0][0]*local_size_b[0][1], MPI_DOUBLE, shift_source, 1, comm_2d, &status);


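    /* Main loop of Cannon's algorithm: in each of the sqrt(p) steps the current A and B blocks
       are multiplied into c_local while the next blocks are shifted in from the right/below
       with non-blocking communication (double buffering). */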
    for (i=0; i<procs_dim[0]; i++) {
        //Exchange the sizes of the incoming submatrices with the right and bottom neighbours, then reallocate the receive buffers to match.
        MPI_Sendrecv(local_size_a[i%2], 2, MPI_INT, left_rank, 1, local_size_a[(i+1)%2], 2, MPI_INT, right_rank, 1, comm_2d, &status);
        deallocate_matrix(&a_buffers[(i+1)%2]);
        allocate_matrix(&a_buffers[(i+1)%2],local_size_a[(i+1)%2][0], local_size_a[(i+1)%2][1]);

        MPI_Sendrecv(local_size_b[i%2], 2, MPI_INT, up_rank, 1, local_size_b[(i+1)%2], 2, MPI_INT, down_rank, 1, comm_2d, &status);
        deallocate_matrix(&b_buffers[(i+1)%2]);
        allocate_matrix(&b_buffers[(i+1)%2],local_size_b[(i+1)%2][0], local_size_b[(i+1)%2][1]);


        MPI_Isend(&(a_buffers[i%2][0][0]), local_size_a[i%2][0]*local_size_a[i%2][1], MPI_DOUBLE, left_rank, 1, comm_2d, &reqs[0]);
        MPI_Isend(&(b_buffers[i%2][0][0]), local_size_b[i%2][0]*local_size_b[i%2][1], MPI_DOUBLE, up_rank, 1, comm_2d, &reqs[1]);
        MPI_Irecv(&(a_buffers[(i+1)%2][0][0]), local_size_a[(i+1)%2][0]*local_size_a[(i+1)%2][1], MPI_DOUBLE, right_rank, 1 ,comm_2d, &reqs[2]);
        MPI_Irecv(&(b_buffers[(i+1)%2][0][0]), local_size_b[(i+1)%2][0]*local_size_b[(i+1)%2][1], MPI_DOUBLE, down_rank, 1 ,comm_2d, &reqs[3]);

        matrix_multiply(local_size_c, local_size_a[i%2][1], a_buffers[i%2], b_buffers[i%2], c_local);
        for(j=0; j<4; j++)
            MPI_Wait(&reqs[j],&status);
    }

    if (my_rank == 0) {
        deallocate_matrix(&a_global);
        deallocate_matrix(&b_global);
        allocate_matrix(&c_global, global_size_a[0], global_size_b[1]);
        matrix_c_ptr = &(c_global[0][0]);
    }

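    /* Gather the local C blocks at the root. The counts and displacements computed for A are
       reused here, which assumes the C blocks are laid out like the A blocks. */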
    gather_submatrices(c_local, matrix_c_ptr, sendCounts, displs_a, global_size_a[0], local_size_c, my_2drank);


    deallocate_matrix(&a_buffers[0]);
    deallocate_matrix(&a_buffers[1]);
    deallocate_matrix(&b_buffers[0]);
    deallocate_matrix(&b_buffers[1]);
    deallocate_matrix(&c_local);
    MPI_Comm_free(&comm_2d);
    MPI_Finalize ();
    return 0;
}
/**
 * Creates random A, B, and C matrices and uses summa() to
 *  calculate the product. Output of summa() is compared 
 *  to CC, the true solution.
 **/
bool random_matrix_test(int m, int n, int k, int px, int py, int panel_size) {
  int proc = 0, passed_test = 0, group_passed = 0;
  int num_procs = px * py;
  int rank = 0;
  double *A, *B, *C, *CC, *A_block, *B_block, *C_block, *CC_block;

  A = NULL;
  B = NULL;
  C = NULL;
  CC = NULL;

  MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* Get process id */

  if (rank == 0) {
    /* Allocate matrices */
    A = random_matrix(m, k);
    B = random_matrix(k, n);
    C = zeros_matrix(m, n);

    /* Stores the solution */
    CC = zeros_matrix(m, n);

    /* 
     * Solve the problem locally and store the
     *  solution in CC
     */
    local_mm(m, n, k, 1.0, A, m, B, k, 0.0, CC, m);
  }

  /* 
   * Allocate memory for matrix blocks 
   */
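  /* Each process holds an (m/px) x (n/py) block of C (and correspondingly sized blocks
     of A and B); m, n and k are assumed to be divisible by px and py. */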
  A_block = malloc(sizeof(double) * (m * k) / num_procs);
  assert(A_block);

  B_block = malloc(sizeof(double) * (k * n) / num_procs);
  assert(B_block);

  C_block = malloc(sizeof(double) * (m * n) / num_procs);
  assert(C_block);

  CC_block = malloc(sizeof(double) * (m * n) / num_procs);
  assert(CC_block);

  /* Distribute the matrices */
  distribute_matrix(px, py, m, k, A, A_block, rank);
  distribute_matrix(px, py, k, n, B, B_block, rank);
  distribute_matrix(px, py, m, n, C, C_block, rank);
  distribute_matrix(px, py, m, n, CC, CC_block, rank);

  if (rank == 0) {

    /* 
     * blocks of A, B, C, and CC have been distributed to
     * each of the processes, now we can safely deallocate the 
     * matrices
     */
    deallocate_matrix(A);
    deallocate_matrix(B);
    deallocate_matrix(C);
    deallocate_matrix(CC);
  }

#ifdef DEBUG
  /* flush output and synchronize the processes */
  fflush(stdout);
  sleep(1);
  MPI_Barrier( MPI_COMM_WORLD);
#endif

  /* 
   *
   * Call SUMMA
   *
   */

  summa(m, n, k, A_block, B_block, C_block, px, py, 1);

#ifdef DEBUG
  /* flush output and synchronize the processes */
  fflush(stdout);
  sleep(1);
  MPI_Barrier( MPI_COMM_WORLD);
#endif

#ifdef DEBUG
  /* Verify each C_block sequentially */
  for (proc=0; proc < num_procs; proc++) {

    if (rank == proc) {

      bool isCorrect = verify_matrix_bool(m / px, n / py, C_block, CC_block);

      if (isCorrect) {
        printf("CBlock on rank=%d is correct\n",rank);
        fflush(stdout);
      } else {
        printf("**\tCBlock on rank=%d is wrong\n",rank);

        printf("CBlock on rank=%d is\n",rank);
        print_matrix(m / px, n / py, C_block);

        printf("CBlock on rank=%d should be\n",rank);
        print_matrix(m / px, n / py, CC_block);

        printf("**\n\n");
        fflush(stdout);

        passed_test = 1;
        sleep(1);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD); /* keep all processes synchronized */
  }/* proc */

#else

  /* each process will verify its C_block in parallel */
  if (verify_matrix_bool(m / px, n / py, C_block, CC_block) == false) {
    passed_test = 1;
  }

#endif

  /* free A_block, B_block, C_block, and CC_block */
  free(A_block);
  free(B_block);
  free(C_block);
  free(CC_block);

  /*
   *
   *  passed_test == 0 if the process PASSED the test
   *  passed_test == 1 if the process FAILED the test
   *  
   *  therefore an MPI_Reduce of passed_test will count the
   *   number of processes that failed
   *
   *  After the MPI_Reduce/MPI_Bcast, if group_passed == 0 then every process passed
   */

  MPI_Reduce(&passed_test, &group_passed, 1, MPI_INT, MPI_SUM, 0,
      MPI_COMM_WORLD);
  MPI_Bcast(&group_passed, 1, MPI_INT, 0, MPI_COMM_WORLD);

  if (rank == 0 && group_passed == 0) {
    printf(
        "random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d............PASSED\n",
        m, n, k, px, py, panel_size);
  }

  if (rank == 0 && group_passed != 0) {
    printf(
        "random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d............FAILED\n",
        m, n, k, px, py, panel_size);
  }

  /* If group_passed==0 then every process passed the test*/
  if (group_passed == 0) {
    return true;
  } else {
    return false;
  }
}