Example #1
int main(int argc, char *argv[])
{
	int *matrix_a;
	int *matrix_b;
	int *matrix_c;

	const char *matrix_a_filename = argv[1];
	const char *matrix_b_filename = argv[2];
	const char *matrix_c_filename = argv[3];

	MPI_Comm matrix_comm;

	MPI_Init(&argc, &argv);

	create_matrix_comm(MPI_COMM_WORLD, &matrix_comm);
	/* size and rank (like sub_n, pp_dims and N used below) are assumed to be
	 * file-scope globals defined alongside the helper routines. */
	MPI_Comm_size(matrix_comm, &size);
	MPI_Comm_rank(matrix_comm, &rank);

	compute_matrixes_variables(matrix_a_filename, matrix_comm);

	alloc_submatrix_buffer(&matrix_a);
	alloc_submatrix_buffer(&matrix_b);
	alloc_submatrix_buffer(&matrix_c);

	distribute_matrix(matrix_a_filename, matrix_a, matrix_comm);
	distribute_matrix(matrix_b_filename, matrix_b, matrix_comm);

	/* The actual Cannon algorithm: multiply local blocks, then shift */
	int row_source, row_dst;
	int col_source, col_dst;
	MPI_Cart_shift(matrix_comm, 0, -1, &row_source, &row_dst);
	MPI_Cart_shift(matrix_comm, 1, -1, &col_source, &col_dst);
	int i;
	for (i = 0; i < pp_dims; i++) {
		compute_matrix_mul(matrix_a, matrix_b, matrix_c, N);
		/* use a concrete tag: MPI_ANY_TAG is only valid on the receive side */
		MPI_Sendrecv_replace(matrix_a, sub_n * sub_n, MPI_INT,
				     row_source, 0, row_dst, 0,
				     matrix_comm, MPI_STATUS_IGNORE);

		MPI_Sendrecv_replace(matrix_b, sub_n * sub_n, MPI_INT,
				     col_source, 0, col_dst, 0,
				     matrix_comm, MPI_STATUS_IGNORE);
	}


	write_result(matrix_c_filename, matrix_c, matrix_comm);

	free(matrix_a);
	free(matrix_b);
	free(matrix_c);

	MPI_Comm_free(&matrix_comm);

	MPI_Finalize();
	return 0;
}
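The helper routines in Example #1 (alloc_submatrix_buffer, compute_matrix_mul, distribute_matrix, write_result) are defined elsewhere. As a minimal sketch only, assuming row-major sub_n x sub_n int blocks, the local multiply that the Cannon loop relies on has to accumulate into C across the pp_dims shift steps, roughly like this (name and signature are assumptions, not the original code):

/* Hypothetical sketch, not the original helper: accumulate C += A*B on the
 * local sub_n x sub_n blocks; C must only be zeroed once, before the loop. */
static void local_block_mul(const int *a, const int *b, int *c, int sub_n)
{
	int i, j, k;
	for (i = 0; i < sub_n; i++)
		for (k = 0; k < sub_n; k++)
			for (j = 0; j < sub_n; j++)
				c[i * sub_n + j] += a[i * sub_n + k] * b[k * sub_n + j];
}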
Example #2
int main(int argc, char *argv[])
{
    int i,j, num_procs, my_rank, ptr_a=0, ptr_b=0;
    int my_2drank, up_rank, down_rank, left_rank, right_rank, shift_source, shift_dest;
    int procs_dim[2], my_coords[2], periods[2], start[2]= {0,0};
    int *displs_a, *displs_b, *sendCounts, local_size_a[2][2], local_size_b[2][2];
    int alloc_row_a, alloc_col_a, alloc_row_b, alloc_col_b;
    int global_size_a[2],global_size_b[2], local_size_c[2];
    double **a_buffers[2], **b_buffers[2], **a_global, **b_global, **c_global, **c_local;

    double *matrix_a_ptr=NULL, *matrix_b_ptr=NULL, *matrix_c_ptr=NULL;

    MPI_Status status;
    MPI_Comm comm_2d;
    MPI_Request reqs[4];

    //MPI_struct datatypes
    MPI_Init (&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    //Equal number of processes in the x and y dimensions (num_procs must be a perfect square)
    procs_dim[0] = procs_dim[1] = sqrt(num_procs);

    //Enable wraparound (periodic boundaries) in both dimensions
    periods[0] = periods[1] = 1;

    /*Create a new topology, setting reorder to true so ranks may be renumbered in the 2D communicator*/
    MPI_Cart_create(MPI_COMM_WORLD, 2, procs_dim, periods, 1, &comm_2d);

    MPI_Comm_rank(comm_2d, &my_2drank);
    MPI_Cart_coords(comm_2d, my_2drank, 2, my_coords);

    //Get the ranks of the left/right and up/down neighbours
    MPI_Cart_shift(comm_2d, 1, -1, &right_rank, &left_rank);
    MPI_Cart_shift(comm_2d, 0, -1, &down_rank, &up_rank);


    if (my_rank == 0) {
        read_matrix_binaryformat((char *)argv[1], &a_global, &global_size_a[0], &global_size_a[1]);
        read_matrix_binaryformat((char *)argv[2], &b_global, &global_size_b[0], &global_size_b[1]);
    }

    MPI_Bcast (&global_size_a[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_a[1], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_b[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast (&global_size_b[1], 1, MPI_INT, 0, comm_2d);

    /* Local block sizes: every process gets floor(global/procs_dim); the last
     * row/column of the process grid also absorbs the remainder. */
    local_size_a[1][0] = local_size_c[0] = (my_coords[0] < procs_dim[0]-1) ? global_size_a[0]/procs_dim[0]
                                           : global_size_a[0]/procs_dim[0] + (global_size_a[0]%procs_dim[0]);
    local_size_a[1][1] = (my_coords[1] < procs_dim[1]-1) ? global_size_a[1]/procs_dim[1]
                                           : global_size_a[1]/procs_dim[1] + (global_size_a[1]%procs_dim[1]);
    local_size_b[1][1] = local_size_c[1] = (my_coords[1] < procs_dim[1]-1) ? global_size_b[1]/procs_dim[1]
                                           : global_size_b[1]/procs_dim[1] + (global_size_b[1]%procs_dim[1]);
    local_size_b[1][0] = (my_coords[0] < procs_dim[0]-1) ? global_size_b[0]/procs_dim[0]
                                           : global_size_b[0]/procs_dim[0] + (global_size_b[0]%procs_dim[0]);
    //printf("rank : %d i : %d j : %d\n",my_2drank,my_coords[0],my_coords[1]);
    //printf("rank : %d c0 : %d c1 : %d\n",my_2drank, local_size_c[0], local_size_c[1]);
    //printf("rank : %d a00 : %d a01 : %d b10 : %d b11 : %d c0 : %d c1 : %d\n",my_2drank, local_size_a[1][0], local_size_a[1][1],local_size_b[1][0], local_size_b[1][1], local_size_c[0], local_size_c[1]);
    allocate_matrix(&a_buffers[1], local_size_a[1][0], local_size_a[1][1]);
    allocate_matrix(&b_buffers[1], local_size_b[1][0], local_size_b[1][1]);
    allocate_matrix(&c_local, local_size_c[0], local_size_c[1]);

    if (my_rank == 0) {
        //The pointers are different than NULL only at root
        matrix_a_ptr = &(a_global[0][0]);
        matrix_b_ptr = &(b_global[0][0]);

        sendCounts = (int *)malloc(num_procs*sizeof(int));
        displs_a = (int *)malloc(num_procs*sizeof(int));
        displs_b = (int *)malloc(num_procs*sizeof(int));
        for(i = 0; i < procs_dim[0]; i++) {
            for(j = 0; j < procs_dim[1]; j++) {
                sendCounts[i*procs_dim[0]+j] = 1;
                displs_a[i*procs_dim[0]+j] = ptr_a;
                displs_b[i*procs_dim[0]+j] = ptr_b;
                ptr_a += 1;
                ptr_b += 1;
            }
            ptr_a += procs_dim[1]*(global_size_a[0]/procs_dim[0] - 1);
            ptr_b += procs_dim[1]*(global_size_b[0]/procs_dim[1] - 1);
        }
    }


    distribute_matrix(a_buffers[1], matrix_a_ptr, sendCounts, displs_a, global_size_a, local_size_a[1]);
    distribute_matrix(b_buffers[1], matrix_b_ptr, sendCounts, displs_b, global_size_b, local_size_b[1]);

    MPI_Cart_shift(comm_2d, 1, -my_coords[0], &shift_source, &shift_dest);

    MPI_Sendrecv(local_size_a[1], 2, MPI_INT, shift_dest, 1, local_size_a[0], 2, MPI_INT, shift_source, 1, comm_2d, &status);
    allocate_matrix(&a_buffers[0], local_size_a[0][0], local_size_a[0][1]);
    MPI_Sendrecv(&(a_buffers[1][0][0]), local_size_a[1][0]*local_size_a[1][1], MPI_DOUBLE,  shift_dest, 1, &(a_buffers[0][0][0]),
                 local_size_a[0][0]*local_size_a[0][1], MPI_DOUBLE, shift_source, 1, comm_2d, &status);

    MPI_Cart_shift(comm_2d, 0, -my_coords[1], &shift_source, &shift_dest);
    printf("rank : %d ss : %d sd : %d\n",my_2drank, shift_source, shift_dest);
    MPI_Sendrecv(local_size_b[1], 2, MPI_INT, shift_dest, 1, local_size_b[0], 2, MPI_INT, shift_source, 1, comm_2d, &status);
    allocate_matrix(&b_buffers[0], local_size_b[0][0], local_size_b[0][1]);
    MPI_Sendrecv(&(b_buffers[1][0][0]), local_size_b[1][0]*local_size_b[1][1], MPI_DOUBLE,  shift_dest, 1, &(b_buffers[0][0][0]),
                 local_size_b[0][0]*local_size_b[0][1], MPI_DOUBLE, shift_source, 1, comm_2d, &status);


    for (i=0; i<procs_dim[0]; i++) {
        //Exchange the sizes of the incoming submatrices with the right/bottom neighbours, then reallocate the receive buffers to match.
        MPI_Sendrecv(local_size_a[i%2], 2, MPI_INT, left_rank, 1, local_size_a[(i+1)%2], 2, MPI_INT, right_rank, 1, comm_2d, &status);
        deallocate_matrix(&a_buffers[(i+1)%2]);
        allocate_matrix(&a_buffers[(i+1)%2],local_size_a[(i+1)%2][0], local_size_a[(i+1)%2][1]);

        MPI_Sendrecv(local_size_b[i%2], 2, MPI_INT, up_rank, 1, local_size_b[(i+1)%2], 2, MPI_INT, down_rank, 1, comm_2d, &status);
        deallocate_matrix(&b_buffers[(i+1)%2]);
        allocate_matrix(&b_buffers[(i+1)%2],local_size_b[(i+1)%2][0], local_size_b[(i+1)%2][1]);


        MPI_Isend(&(a_buffers[i%2][0][0]), local_size_a[i%2][0]*local_size_a[i%2][1], MPI_DOUBLE, left_rank, 1, comm_2d, &reqs[0]);
        MPI_Isend(&(b_buffers[i%2][0][0]), local_size_b[i%2][0]*local_size_b[i%2][1], MPI_DOUBLE, up_rank, 1, comm_2d, &reqs[1]);
        MPI_Irecv(&(a_buffers[(i+1)%2][0][0]), local_size_a[(i+1)%2][0]*local_size_a[(i+1)%2][1], MPI_DOUBLE, right_rank, 1 ,comm_2d, &reqs[2]);
        MPI_Irecv(&(b_buffers[(i+1)%2][0][0]), local_size_b[(i+1)%2][0]*local_size_b[(i+1)%2][1], MPI_DOUBLE, down_rank, 1 ,comm_2d, &reqs[3]);

        matrix_multiply(local_size_c, local_size_a[i%2][1], a_buffers[i%2], b_buffers[i%2], c_local);
        for(j=0; j<4; j++)
            MPI_Wait(&reqs[j],&status);
    }

    if (my_rank == 0) {
        deallocate_matrix(&a_global);
        deallocate_matrix(&b_global);
        allocate_matrix(&c_global, global_size_a[0], global_size_b[1]);
        matrix_c_ptr = &(c_global[0][0]);
    }

    gather_submatrices(c_local, matrix_c_ptr, sendCounts, displs_a, global_size_a[0], local_size_c, my_2drank);


    deallocate_matrix(&a_buffers[0]);
    deallocate_matrix(&a_buffers[1]);
    deallocate_matrix(&b_buffers[0]);
    deallocate_matrix(&b_buffers[1]);
    deallocate_matrix(&c_local);
    MPI_Comm_free(&comm_2d);
    MPI_Finalize ();
    return 0;
}
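allocate_matrix() and deallocate_matrix() in Example #2 are not shown. Because the code passes &(buf[0][0]) to MPI with a count of rows*cols doubles, the allocation must be one contiguous block plus a separate array of row pointers. A minimal sketch under that assumption (the real helpers may differ):

#include <stdlib.h>

/* Hypothetical sketch: contiguous storage plus row pointers, so that
 * &(matrix[0][0]) addresses rows*cols consecutive doubles. */
void allocate_matrix(double ***matrix, int rows, int cols)
{
    double *data = calloc((size_t)rows * cols, sizeof(double));
    double **m = malloc(rows * sizeof(double *));
    int i;
    for (i = 0; i < rows; i++)
        m[i] = data + (size_t)i * cols;   /* row i points into the block */
    *matrix = m;
}

void deallocate_matrix(double ***matrix)
{
    free((*matrix)[0]);   /* the contiguous data block */
    free(*matrix);        /* the row-pointer array */
    *matrix = NULL;
}

Using calloc in the sketch also gives c_local the zero initial values that the repeated matrix_multiply() accumulation in the main loop depends on.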
Example #3
int main(int argc, char** argv) {
  int rank, size;
  int N = 0;
  int opt;
  int nt = -1;
  int max_threads = 16; // on jupiter

  bool id = false;


  algo_t algo = reduce_scatter;
  FILE *f = NULL;

  static const char optstring[] = "n:a:f:i:p:";
  static const struct option long_options[] = {
		{"n",			1, NULL, 'n'},
    {"file",		1, NULL, 'f'},
    {"i",			1, NULL, 'i'},
		{NULL,			0, NULL, 0}
  };


  MPI_Init(&argc,&argv);

  // get rank and size from communicator
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  while ((opt = getopt_long(argc, argv, optstring, long_options, NULL)) != -1) {
    switch(opt) {
    case 'i':
      if (strcmp("procs", optarg) == 0) {
        id = true;
      }
      break;
    case 'p':
      nt = atoi(optarg);
      if (nt > max_threads) {
        printf("Using too many procs %d, use at most %d\n", nt, max_threads);
        MPI_Finalize();
        return EXIT_FAILURE;
      } else {
        printf("Using %d procs.\n", nt);
      }
      break;
    case 'n':
      N = atoi(optarg);
      break;
    case 'f':
      f = fopen(optarg, "a");
      if (f == NULL) {
        mpi_printf(root, "Could not open log file '%s': %s\n", optarg, strerror(errno));
        MPI_Finalize();
        return EXIT_FAILURE;
      }
      break;
    case 'a':
      if (strcmp("ref", optarg) == 0) {
        mpi_printf(root, "Using reference implementation \n");
        algo = ref;
      } else if ((strcmp("reduce_scatter", optarg) == 0)) {
        mpi_printf(root, "Using MPI_Reduce_scatter implementation \n");
        algo = reduce_scatter;
      }
      break;
    default:
      MPI_Finalize();
      return  EXIT_FAILURE;
    }
  }

  if (N == 0) {
    if (rank == root) {
      printf("Usage: mpirun -np nodecount p3-reduce_scatter.exe -n N\n");
      printf("N is the matrix size.\n\n");
    }
    MPI_Finalize();
    return 1;
  }


  /* ======================================================== */
  /* Initialisation matrix & vector */

  ATYPE *matrix = NULL;
  ATYPE *vector = NULL;


  if (rank == root) {
    debug("Setting up root data structures");
    matrix = init_matrix(N,1);
    vector = init_vector(N,1);
  }

  int colcnt =  N - (N/size ) * (size - 1 );
  int partition = N/size;

  ATYPE *local_matrix = NULL;
  local_matrix = (ATYPE*) malloc(sizeof(ATYPE) * N * colcnt);

  ATYPE *local_vector = NULL;
  local_vector = (ATYPE*) malloc(sizeof(ATYPE) * partition);


  ATYPE *reference = NULL;
  reference = init_vector(N,1);

  ATYPE *result = NULL;
  result = init_vector(N,1);

  double inittime, totaltime;

  if( algo == ref) {
    if (rank == root) {
      inittime = MPI_Wtime();
      matrix_vector_mult_ref(matrix, vector, N, reference);
      totaltime = MPI_Wtime() - inittime;
    }
  } else if (algo == reduce_scatter) {

    if(rank == root){
      debug("Comptuting reference");
      matrix_vector_mult_ref(matrix, vector, N, reference);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* ======================================================== */
    /* distributing matrix and vector */


    distribute_vector(vector, local_vector, rank, size, partition, N);
    distribute_matrix(matrix, local_matrix, rank, size, partition, N);


    debug("begin MPI_Reduce_scatter");
    MPI_Barrier(MPI_COMM_WORLD);
    inittime = MPI_Wtime();
    compute_reduce_scatter(local_matrix, local_vector, result, rank, size, N, partition);

    MPI_Barrier(MPI_COMM_WORLD);

    totaltime = MPI_Wtime() - inittime;
    double localtime = totaltime;

    MPI_Reduce(&localtime, &totaltime, 1, MPI_DOUBLE, MPI_MAX, root,  MPI_COMM_WORLD);

    debug("after MPI_Reduce_scatter");
  /* TODO: fix test so it uses vector idea  */
    /* debug("Testing result"); */
    /* if (test_vector_part(result, local_vector, (rank * partition) , partition)) { */
    /*   debug("testresult: OK"); */
    /* } else { */
    /*   debug("testresult: FAILURE"); */
    /*   debug("Result:"); */
    /*   printArray(recvbuff, N); */
    /*   debug("Reference:"); */
    /*   printArray(reference,N); */
    /* } */

    MPI_Barrier(MPI_COMM_WORLD);
  }

  if (rank == 0) {
    if (f != NULL) {
      if (id) {
        fprintf(f,"%d,%lf\n",nt, totaltime);
      } else {
        fprintf(f,"%d,%lf\n",N, totaltime);
      }
    }
    if (id) {
      printf("%d,%lf\n",nt , totaltime);
    } else {
      printf("%d,%lf\n",N , totaltime);
    }
  }

  debug("cleaning up");


  free(vector);

  free(matrix);

  MPI_Finalize();

  if ( f != NULL) {
    fclose(f);
  }
  return 0;
}
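compute_reduce_scatter() is the core of Example #3 but is defined elsewhere. A minimal sketch of the idea, assuming ATYPE is double, N is divisible by size, and local_matrix holds this rank's block of `partition` columns in row-major order (all assumptions about the real helper): each rank forms a full-length partial product from its columns, and MPI_Reduce_scatter both sums the partials and hands each rank its own block of the result.

#include <stdlib.h>
#include <mpi.h>

/* Hypothetical sketch, assuming ATYPE == double and N % size == 0. */
void compute_reduce_scatter(double *local_matrix, double *local_vector,
                            double *result, int rank, int size,
                            int N, int partition)
{
    double *partial = calloc(N, sizeof(double));
    int *recvcounts = malloc(size * sizeof(int));
    int i, j, p;
    (void)rank;   /* rank is not needed in this sketch */

    /* partial = A(:, my columns) * x(my block) */
    for (i = 0; i < N; i++)
        for (j = 0; j < partition; j++)
            partial[i] += local_matrix[i * partition + j] * local_vector[j];

    /* every rank keeps `partition` summed entries of y = A*x */
    for (p = 0; p < size; p++)
        recvcounts[p] = partition;

    MPI_Reduce_scatter(partial, result, recvcounts, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    free(partial);
    free(recvcounts);
}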
/**
 * Creates random A, B, and C matrices and uses summa() to
 *  calculate the product. Output of summa() is compared 
 *  to CC, the true solution.
 **/
bool random_matrix_test(int m, int n, int k, int px, int py, int panel_size) {
  int proc = 0, passed_test = 0, group_passed = 0;
  int num_procs = px * py;
  int rank = 0;
  double *A, *B, *C, *CC, *A_block, *B_block, *C_block, *CC_block;

  A = NULL;
  B = NULL;
  C = NULL;
  CC = NULL;

  MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* Get process id */

  if (rank == 0) {
    /* Allocate matrices */
    A = random_matrix(m, k);
    B = random_matrix(k, n);
    C = zeros_matrix(m, n);

    /* Stores the solution */
    CC = zeros_matrix(m, n);

    /* 
     * Solve the problem locally and store the
     *  solution in CC
     */
    local_mm(m, n, k, 1.0, A, m, B, k, 0.0, CC, m);
  }

  /* 
   * Allocate memory for matrix blocks 
   */
  A_block = malloc(sizeof(double) * (m * k) / num_procs);
  assert(A_block);

  B_block = malloc(sizeof(double) * (k * n) / num_procs);
  assert(B_block);

  C_block = malloc(sizeof(double) * (m * n) / num_procs);
  assert(C_block);

  CC_block = malloc(sizeof(double) * (m * n) / num_procs);
  assert(CC_block);

  /* Distribute the matrices */
  distribute_matrix(px, py, m, k, A, A_block, rank);
  distribute_matrix(px, py, k, n, B, B_block, rank);
  distribute_matrix(px, py, m, n, C, C_block, rank);
  distribute_matrix(px, py, m, n, CC, CC_block, rank);

  if (rank == 0) {

    /* 
     * blocks of A, B, C, and CC have been distributed to
     * each of the processes, now we can safely deallocate the 
     * matrices
     */
    deallocate_matrix(A);
    deallocate_matrix(B);
    deallocate_matrix(C);
    deallocate_matrix(CC);
  }

#ifdef DEBUG
  /* flush output and synchronize the processes */
  fflush(stdout);
  sleep(1);
  MPI_Barrier( MPI_COMM_WORLD);
#endif

  /* 
   *
   * Call SUMMA
   *
   */

  summa(m, n, k, A_block, B_block, C_block, px, py, 1);

#ifdef DEBUG
  /* flush output and synchronize the processes */
  fflush(stdout);
  sleep(1);
  MPI_Barrier( MPI_COMM_WORLD);
#endif

#ifdef DEBUG
  /* Verify each C_block sequentially */
  for (proc=0; proc < num_procs; proc++) {

    if (rank == proc) {

      bool isCorrect = verify_matrix_bool(m / px, n / py, C_block, CC_block);

      if (isCorrect) {
        printf("CBlock on rank=%d is correct\n",rank);
        fflush(stdout);
      } else {
        printf("**\tCBlock on rank=%d is wrong\n",rank);

        printf("CBlock on rank=%d is\n",rank);
        print_matrix(m / px, n / py, C_block);

        printf("CBlock on rank=%d should be\n",rank);
        print_matrix(m / px, n / py, CC_block);

        printf("**\n\n");
        fflush(stdout);

        passed_test = 1;
        sleep(1);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD); /* keep all processes synchronized */
  }/* proc */

#else

  /* each process will verify its C_block in parallel */
  if (verify_matrix_bool(m / px, n / py, C_block, CC_block) == false) {
    passed_test = 1;
  }

#endif

  /* free A_block, B_block, C_block, and CC_block */
  free(A_block);
  free(B_block);
  free(C_block);
  free(CC_block);

  /*
   *
   *  passed_test == 0 if the process PASSED the test
   *  passed_test == 1 if the process FAILED the test
   *  
   *  therefore a MPI_Reduce of passed_test will count the
   *   number of processes that failed
   *  
   *  After the MPI_Reduce/MPI_Bcast, if group_passed == 0 then every process passed
   */

  MPI_Reduce(&passed_test, &group_passed, 1, MPI_INT, MPI_SUM, 0,
      MPI_COMM_WORLD);
  MPI_Bcast(&group_passed, 1, MPI_INT, 0, MPI_COMM_WORLD);

  if (rank == 0 && group_passed == 0) {
    printf(
        "random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d............PASSED\n",
        m, n, k, px, py, panel_size);
  }

  if (rank == 0 && group_passed != 0) {
    printf(
        "random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d............FAILED\n",
        m, n, k, px, py, panel_size);
  }

  /* If group_passed==0 then every process passed the test*/
  if (group_passed == 0) {
    return true;
  } else {
    return false;
  }
}
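verify_matrix_bool() used above is not shown. A plausible sketch, assuming an elementwise comparison with a small relative tolerance for the floating-point SUMMA result (the name of the function matches the call sites, but the tolerance and body are assumptions):

#include <math.h>
#include <stdbool.h>

/* Hypothetical sketch: compare the computed block against the reference block
 * element by element, allowing for floating-point rounding differences. */
bool verify_matrix_bool(int rows, int cols, const double *computed, const double *reference)
{
    const double tol = 1e-9;
    int i;
    for (i = 0; i < rows * cols; i++) {
        if (fabs(computed[i] - reference[i]) > tol * (1.0 + fabs(reference[i])))
            return false;
    }
    return true;
}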