int main(int argc, char *argv[])
{
    int *matrix_a;
    int *matrix_b;
    int *matrix_c;
    const char *matrix_a_filename;
    const char *matrix_b_filename;
    const char *matrix_c_filename;
    MPI_Comm matrix_comm;

    MPI_Init(&argc, &argv);

    if (argc < 4) {
        fprintf(stderr, "Usage: %s <matrix_a> <matrix_b> <matrix_c>\n", argv[0]);
        MPI_Finalize();
        return 1;
    }
    matrix_a_filename = argv[1];
    matrix_b_filename = argv[2];
    matrix_c_filename = argv[3];

    create_matrix_comm(MPI_COMM_WORLD, &matrix_comm);
    /* size, rank, pp_dims and sub_n are file-scope globals filled in by
       the helpers below. */
    MPI_Comm_size(matrix_comm, &size);
    MPI_Comm_rank(matrix_comm, &rank);

    compute_matrixes_variables(matrix_a_filename, matrix_comm);

    alloc_submatrix_buffer(&matrix_a);
    alloc_submatrix_buffer(&matrix_b);
    alloc_submatrix_buffer(&matrix_c);

    distribute_matrix(matrix_a_filename, matrix_a, matrix_comm);
    distribute_matrix(matrix_b_filename, matrix_b, matrix_comm);

    /* The actual Cannon algorithm: multiply the local blocks, then shift
       the A blocks cyclically along dimension 0 and the B blocks along
       dimension 1. Each rank sends to its "+1" neighbour and receives
       from its "-1" neighbour, which is a consistent cyclic shift. */
    int row_source, row_dst;
    int col_source, col_dst;
    MPI_Cart_shift(matrix_comm, 0, -1, &row_source, &row_dst);
    MPI_Cart_shift(matrix_comm, 1, -1, &col_source, &col_dst);

    int i;
    for (i = 0; i < pp_dims; i++) {
        /* The local blocks are sub_n x sub_n, so that is the size the
           local multiplication must use. */
        compute_matrix_mul(matrix_a, matrix_b, matrix_c, sub_n);
        /* MPI_ANY_TAG is only valid as a receive tag; the send tag must
           be a concrete value. */
        MPI_Sendrecv_replace(matrix_a, sub_n * sub_n, MPI_INT,
                             row_source, 0, row_dst, MPI_ANY_TAG,
                             matrix_comm, MPI_STATUS_IGNORE);
        MPI_Sendrecv_replace(matrix_b, sub_n * sub_n, MPI_INT,
                             col_source, 0, col_dst, MPI_ANY_TAG,
                             matrix_comm, MPI_STATUS_IGNORE);
    }

    write_result(matrix_c_filename, matrix_c, matrix_comm);

    free(matrix_a);
    free(matrix_b);
    free(matrix_c);
    MPI_Comm_free(&matrix_comm);
    MPI_Finalize();
    return 0;
}
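/* None of the helpers above are shown. As one concrete example, here is a
 * minimal, hypothetical sketch of compute_matrix_mul, assuming row-major
 * n x n int blocks. Cannon's algorithm relies on the accumulation (+=)
 * across the pp_dims shift steps, so C must not be cleared between calls. */
static void compute_matrix_mul(const int *a, const int *b, int *c, int n)
{
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (k = 0; k < n; k++) {
            const int aik = a[i * n + k];  /* hoist a[i][k] out of loop j */
            for (j = 0; j < n; j++)
                c[i * n + j] += aik * b[k * n + j];
        }
    }
}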
int main(int argc, char *argv[])
{
    int i, j, num_procs, my_rank, ptr_a = 0, ptr_b = 0;
    int my_2drank, up_rank, down_rank, left_rank, right_rank,
        shift_source, shift_dest;
    int procs_dim[2], my_coords[2], periods[2];
    int *displs_a, *displs_b, *sendCounts,
        local_size_a[2][2], local_size_b[2][2];
    int global_size_a[2], global_size_b[2], local_size_c[2];
    double **a_buffers[2], **b_buffers[2],
           **a_global, **b_global, **c_global, **c_local;
    double *matrix_a_ptr = NULL, *matrix_b_ptr = NULL, *matrix_c_ptr = NULL;
    MPI_Status status;
    MPI_Comm comm_2d;
    MPI_Request reqs[4];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Equal number of processes in the x and y dimensions; this assumes
       num_procs is a perfect square. */
    procs_dim[0] = procs_dim[1] = (int)sqrt((double)num_procs);

    /* Set wraparound to true, i.e. periods different from 0. */
    periods[0] = periods[1] = 1;

    /* Create a new topology, setting reorder to true so ranks may be
       renumbered in the 2D communicator. */
    MPI_Cart_create(MPI_COMM_WORLD, 2, procs_dim, periods, 1, &comm_2d);
    MPI_Comm_rank(comm_2d, &my_2drank);
    MPI_Cart_coords(comm_2d, my_2drank, 2, my_coords);

    /* Get the ranks of the processes to the left and above. */
    MPI_Cart_shift(comm_2d, 1, -1, &right_rank, &left_rank);
    MPI_Cart_shift(comm_2d, 0, -1, &down_rank, &up_rank);

    if (my_rank == 0) {
        read_matrix_binaryformat((char *)argv[1], &a_global,
                                 &global_size_a[0], &global_size_a[1]);
        read_matrix_binaryformat((char *)argv[2], &b_global,
                                 &global_size_b[0], &global_size_b[1]);
    }
    MPI_Bcast(&global_size_a[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast(&global_size_a[1], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast(&global_size_b[0], 1, MPI_INT, 0, comm_2d);
    MPI_Bcast(&global_size_b[1], 1, MPI_INT, 0, comm_2d);

    /* Local block sizes: every process gets floor(size/dim); the last grid
       row/column also absorbs the remainder. */
    local_size_a[1][0] = local_size_c[0] =
        (my_coords[0] < procs_dim[0] - 1)
            ? global_size_a[0] / procs_dim[0]
            : global_size_a[0] / procs_dim[0] + global_size_a[0] % procs_dim[0];
    local_size_a[1][1] =
        (my_coords[1] < procs_dim[1] - 1)
            ? global_size_a[1] / procs_dim[1]
            : global_size_a[1] / procs_dim[1] + global_size_a[1] % procs_dim[1];
    local_size_b[1][1] = local_size_c[1] =
        (my_coords[1] < procs_dim[1] - 1)
            ? global_size_b[1] / procs_dim[1]
            : global_size_b[1] / procs_dim[1] + global_size_b[1] % procs_dim[1];
    local_size_b[1][0] =
        (my_coords[0] < procs_dim[0] - 1)
            ? global_size_b[0] / procs_dim[0]
            : global_size_b[0] / procs_dim[0] + global_size_b[0] % procs_dim[0];

    allocate_matrix(&a_buffers[1], local_size_a[1][0], local_size_a[1][1]);
    allocate_matrix(&b_buffers[1], local_size_b[1][0], local_size_b[1][1]);
    allocate_matrix(&c_local, local_size_c[0], local_size_c[1]);

    if (my_rank == 0) {
        /* The pointers are different from NULL only at root. */
        matrix_a_ptr = &(a_global[0][0]);
        matrix_b_ptr = &(b_global[0][0]);
        sendCounts = (int *)malloc(num_procs * sizeof(int));
        displs_a = (int *)malloc(num_procs * sizeof(int));
        displs_b = (int *)malloc(num_procs * sizeof(int));
        for (i = 0; i < procs_dim[0]; i++) {
            for (j = 0; j < procs_dim[1]; j++) {
                sendCounts[i * procs_dim[0] + j] = 1;
                displs_a[i * procs_dim[0] + j] = ptr_a;
                displs_b[i * procs_dim[0] + j] = ptr_b;
                ptr_a += 1;
                ptr_b += 1;
            }
            ptr_a += procs_dim[1] * (global_size_a[0] / procs_dim[0] - 1);
            ptr_b += procs_dim[1] * (global_size_b[0] / procs_dim[1] - 1);
        }
    }

    distribute_matrix(a_buffers[1], matrix_a_ptr, sendCounts, displs_a,
                      global_size_a, local_size_a[1]);
    distribute_matrix(b_buffers[1], matrix_b_ptr, sendCounts, displs_b,
                      global_size_b, local_size_b[1]);

    /* Initial alignment for Cannon's algorithm: shift row i of the A
       blocks i steps to the left ... */
    MPI_Cart_shift(comm_2d, 1, -my_coords[0], &shift_source, &shift_dest);
    MPI_Sendrecv(local_size_a[1], 2, MPI_INT, shift_dest, 1,
                 local_size_a[0], 2, MPI_INT, shift_source, 1,
                 comm_2d, &status);
    allocate_matrix(&a_buffers[0], local_size_a[0][0], local_size_a[0][1]);
    MPI_Sendrecv(&(a_buffers[1][0][0]),
                 local_size_a[1][0] * local_size_a[1][1], MPI_DOUBLE,
                 shift_dest, 1,
                 &(a_buffers[0][0][0]),
                 local_size_a[0][0] * local_size_a[0][1], MPI_DOUBLE,
                 shift_source, 1, comm_2d, &status);

    /* ... and column j of the B blocks j steps up. */
    MPI_Cart_shift(comm_2d, 0, -my_coords[1], &shift_source, &shift_dest);
    MPI_Sendrecv(local_size_b[1], 2, MPI_INT, shift_dest, 1,
                 local_size_b[0], 2, MPI_INT, shift_source, 1,
                 comm_2d, &status);
    allocate_matrix(&b_buffers[0], local_size_b[0][0], local_size_b[0][1]);
    MPI_Sendrecv(&(b_buffers[1][0][0]),
                 local_size_b[1][0] * local_size_b[1][1], MPI_DOUBLE,
                 shift_dest, 1,
                 &(b_buffers[0][0][0]),
                 local_size_b[0][0] * local_size_b[0][1], MPI_DOUBLE,
                 shift_source, 1, comm_2d, &status);

    for (i = 0; i < procs_dim[0]; i++) {
        /* Receive the size of the next submatrix from the right/below.
           Blocks can differ in size, so the incoming buffer is reallocated
           to fit before the data transfer is posted. */
        MPI_Sendrecv(local_size_a[i % 2], 2, MPI_INT, left_rank, 1,
                     local_size_a[(i + 1) % 2], 2, MPI_INT, right_rank, 1,
                     comm_2d, &status);
        deallocate_matrix(&a_buffers[(i + 1) % 2]);
        allocate_matrix(&a_buffers[(i + 1) % 2],
                        local_size_a[(i + 1) % 2][0],
                        local_size_a[(i + 1) % 2][1]);
        MPI_Sendrecv(local_size_b[i % 2], 2, MPI_INT, up_rank, 1,
                     local_size_b[(i + 1) % 2], 2, MPI_INT, down_rank, 1,
                     comm_2d, &status);
        deallocate_matrix(&b_buffers[(i + 1) % 2]);
        allocate_matrix(&b_buffers[(i + 1) % 2],
                        local_size_b[(i + 1) % 2][0],
                        local_size_b[(i + 1) % 2][1]);

        /* Overlap communication with computation: ship the current blocks
           left/up while multiplying them locally. */
        MPI_Isend(&(a_buffers[i % 2][0][0]),
                  local_size_a[i % 2][0] * local_size_a[i % 2][1],
                  MPI_DOUBLE, left_rank, 1, comm_2d, &reqs[0]);
        MPI_Isend(&(b_buffers[i % 2][0][0]),
                  local_size_b[i % 2][0] * local_size_b[i % 2][1],
                  MPI_DOUBLE, up_rank, 1, comm_2d, &reqs[1]);
        MPI_Irecv(&(a_buffers[(i + 1) % 2][0][0]),
                  local_size_a[(i + 1) % 2][0] * local_size_a[(i + 1) % 2][1],
                  MPI_DOUBLE, right_rank, 1, comm_2d, &reqs[2]);
        MPI_Irecv(&(b_buffers[(i + 1) % 2][0][0]),
                  local_size_b[(i + 1) % 2][0] * local_size_b[(i + 1) % 2][1],
                  MPI_DOUBLE, down_rank, 1, comm_2d, &reqs[3]);

        matrix_multiply(local_size_c, local_size_a[i % 2][1],
                        a_buffers[i % 2], b_buffers[i % 2], c_local);

        for (j = 0; j < 4; j++)
            MPI_Wait(&reqs[j], &status);
    }

    if (my_rank == 0) {
        deallocate_matrix(&a_global);
        deallocate_matrix(&b_global);
        allocate_matrix(&c_global, global_size_a[0], global_size_b[1]);
        matrix_c_ptr = &(c_global[0][0]);
    }
    gather_submatrices(c_local, matrix_c_ptr, sendCounts, displs_a,
                       global_size_a[0], local_size_c, my_2drank);

    if (my_rank == 0) {
        free(sendCounts);
        free(displs_a);
        free(displs_b);
        deallocate_matrix(&c_global);
    }
    deallocate_matrix(&a_buffers[0]);
    deallocate_matrix(&a_buffers[1]);
    deallocate_matrix(&b_buffers[0]);
    deallocate_matrix(&b_buffers[1]);
    deallocate_matrix(&c_local);
    MPI_Comm_free(&comm_2d);
    MPI_Finalize();
    return 0;
}
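/* The transfers above pass &(buf[0][0]) with a count of rows*cols elements,
 * which only works if the storage behind the row pointers is one contiguous
 * block. A hypothetical sketch of allocate_matrix/deallocate_matrix under
 * that assumption (the real helpers live elsewhere; assumes <stdlib.h>): */
void allocate_matrix(double ***matrix, int rows, int cols)
{
    double *storage = (double *)malloc(rows * cols * sizeof(double));
    double **m = (double **)malloc(rows * sizeof(double *));
    int i;
    for (i = 0; i < rows; i++)
        m[i] = &storage[i * cols];   /* row pointers into the single block */
    *matrix = m;
}

void deallocate_matrix(double ***matrix)
{
    free((*matrix)[0]);   /* frees the contiguous storage */
    free(*matrix);        /* frees the row-pointer array */
    *matrix = NULL;
}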
int main(int argc, char **argv)
{
    int rank, size;
    int N = 0;
    int opt;
    int nt = -1;
    int max_threads = 16;   /* on jupiter */
    bool id = false;
    algo_t algo = reduce_scatter;
    FILE *f = NULL;

    static const char optstring[] = "n:a:f:i:p:";
    static const struct option long_options[] = {
        {"n",    1, NULL, 'n'},
        {"file", 1, NULL, 'f'},
        {"i",    1, NULL, 'i'},
        {NULL,   0, NULL, 0}
    };

    MPI_Init(&argc, &argv);

    /* Get rank and size from the communicator. */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* getopt_long() returns int and signals the end of the options with -1,
       so opt must be an int, not a char. */
    while ((opt = getopt_long(argc, argv, optstring, long_options, NULL)) != -1) {
        switch (opt) {
        case 'i':
            if (strcmp("procs", optarg) == 0) {
                id = true;
            }
            break;
        case 'p':
            nt = atoi(optarg);
            if (nt > max_threads) {
                printf("Using too many processes (%d); the maximum is %d.\n",
                       nt, max_threads);
                MPI_Finalize();
                return EXIT_FAILURE;
            }
            printf("Using %d processes.\n", nt);
            break;   /* without this break, 'p' fell through and clobbered N */
        case 'n':
            N = atoi(optarg);
            break;
        case 'f':
            f = fopen(optarg, "a");
            if (f == NULL) {
                mpi_printf(root, "Could not open log file '%s': %s\n",
                           optarg, strerror(errno));
                MPI_Finalize();
                return EXIT_FAILURE;
            }
            break;
        case 'a':
            if (strcmp("ref", optarg) == 0) {
                mpi_printf(root, "Using reference implementation\n");
                algo = ref;
            } else if (strcmp("reduce_scatter", optarg) == 0) {
                mpi_printf(root, "Using MPI_Reduce_scatter implementation\n");
                algo = reduce_scatter;
            }
            break;
        default:
            MPI_Finalize();
            return EXIT_FAILURE;
        }
    }

    if (N == 0) {
        if (rank == root) {
            printf("Usage: mpirun -np nodecount p3-reduce_scatter.exe -n N\n");
            printf("N is the matrix size.\n\n");
        }
        MPI_Finalize();
        return 1;
    }

    /* ======================================================== */
    /* Initialise matrix & vector */
    ATYPE *matrix = NULL;
    ATYPE *vector = NULL;

    if (rank == root) {
        debug("Setting up root data structures");
        matrix = init_matrix(N, 1);
        vector = init_vector(N, 1);
    }

    /* Each rank gets N/size columns; the last rank also keeps whatever
       does not divide evenly. */
    int colcnt = N - (N / size) * (size - 1);
    int partition = N / size;

    ATYPE *local_matrix = (ATYPE *)malloc(sizeof(ATYPE) * N * colcnt);
    ATYPE *local_vector = (ATYPE *)malloc(sizeof(ATYPE) * partition);
    ATYPE *reference = init_vector(N, 1);
    ATYPE *result = init_vector(N, 1);

    double inittime, totaltime;

    if (algo == ref) {
        if (rank == root) {
            inittime = MPI_Wtime();
            matrix_vector_mult_ref(matrix, vector, N, reference);
            totaltime = MPI_Wtime() - inittime;
        }
    } else if (algo == reduce_scatter) {
        if (rank == root) {
            debug("Computing reference");
            matrix_vector_mult_ref(matrix, vector, N, reference);
        }
        MPI_Barrier(MPI_COMM_WORLD);

        /* ======================================================== */
        /* Distribute matrix and vector */
        distribute_vector(vector, local_vector, rank, size, partition, N);
        distribute_matrix(matrix, local_matrix, rank, size, partition, N);

        debug("begin MPI_Reduce_scatter");
        MPI_Barrier(MPI_COMM_WORLD);
        inittime = MPI_Wtime();

        compute_reduce_scatter(local_matrix, local_vector, result,
                               rank, size, N, partition);

        MPI_Barrier(MPI_COMM_WORLD);
        totaltime = MPI_Wtime() - inittime;
        double localtime = totaltime;
        MPI_Reduce(&localtime, &totaltime, 1, MPI_DOUBLE, MPI_MAX,
                   root, MPI_COMM_WORLD);
        debug("after MPI_Reduce_scatter");

        /* TODO: fix test so it uses vector idea */
        /* debug("Testing result"); */
        /* if (test_vector_part(result, local_vector, (rank * partition), partition)) { */
        /*     debug("testresult: OK"); */
        /* } else { */
        /*     debug("testresult: FAILURE"); */
        /*     debug("Result:"); */
        /*     printArray(recvbuff, N); */
        /*     debug("Reference:"); */
        /*     printArray(reference, N); */
        /* } */

        MPI_Barrier(MPI_COMM_WORLD);
    }

    if (rank == 0) {
        if (f != NULL) {
            if (id) {
                fprintf(f, "%d,%lf\n", nt, totaltime);
            } else {
                fprintf(f, "%d,%lf\n", N, totaltime);
            }
        }
        if (id) {
            printf("%d,%lf\n", nt, totaltime);
        } else {
            printf("%d,%lf\n", N, totaltime);
        }
    }

    debug("cleaning up");
    free(vector);
    free(matrix);
    free(local_matrix);
    free(local_vector);
    free(reference);
    free(result);

    MPI_Finalize();

    if (f != NULL) {
        fclose(f);
    }

    return 0;
}
/**
 * Creates random A, B, and C matrices and uses summa() to
 * calculate the product. The output of summa() is compared
 * to CC, the true solution.
 **/
bool random_matrix_test(int m, int n, int k, int px, int py, int panel_size)
{
    int proc = 0, passed_test = 0, group_passed = 0;
    int num_procs = px * py;
    int rank = 0;
    double *A, *B, *C, *CC, *A_block, *B_block, *C_block, *CC_block;

    A = NULL;
    B = NULL;
    C = NULL;
    CC = NULL;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* Get process id */

    if (rank == 0) {
        /* Allocate matrices */
        A = random_matrix(m, k);
        B = random_matrix(k, n);
        C = zeros_matrix(m, n);
        CC = zeros_matrix(m, n); /* Stores the solution */

        /*
         * Solve the problem locally and store the
         * solution in CC
         */
        local_mm(m, n, k, 1.0, A, m, B, k, 0.0, CC, m);
    }

    /*
     * Allocate memory for matrix blocks
     */
    A_block = malloc(sizeof(double) * (m * k) / num_procs);
    assert(A_block);
    B_block = malloc(sizeof(double) * (k * n) / num_procs);
    assert(B_block);
    C_block = malloc(sizeof(double) * (m * n) / num_procs);
    assert(C_block);
    CC_block = malloc(sizeof(double) * (m * n) / num_procs);
    assert(CC_block);

    /* Distribute the matrices */
    distribute_matrix(px, py, m, k, A, A_block, rank);
    distribute_matrix(px, py, k, n, B, B_block, rank);
    distribute_matrix(px, py, m, n, C, C_block, rank);
    distribute_matrix(px, py, m, n, CC, CC_block, rank);

    if (rank == 0) {
        /*
         * Blocks of A, B, C, and CC have been distributed to
         * each of the processes, so we can now safely deallocate
         * the global matrices.
         */
        deallocate_matrix(A);
        deallocate_matrix(B);
        deallocate_matrix(C);
        deallocate_matrix(CC);
    }

#ifdef DEBUG
    /* Flush output and synchronize the processes */
    fflush(stdout);
    sleep(1);
    MPI_Barrier(MPI_COMM_WORLD);
#endif

    /*
     * Call SUMMA
     */
    summa(m, n, k, A_block, B_block, C_block, px, py, 1);

#ifdef DEBUG
    /* Flush output and synchronize the processes */
    fflush(stdout);
    sleep(1);
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#ifdef DEBUG
    /* Verify each C_block sequentially */
    for (proc = 0; proc < num_procs; proc++) {
        if (rank == proc) {
            bool isCorrect = verify_matrix_bool(m / px, n / py,
                                                C_block, CC_block);
            if (isCorrect) {
                printf("CBlock on rank=%d is correct\n", rank);
                fflush(stdout);
            } else {
                printf("**\tCBlock on rank=%d is wrong\n", rank);
                printf("CBlock on rank=%d is\n", rank);
                print_matrix(m / px, n / py, C_block);
                printf("CBlock on rank=%d should be\n", rank);
                print_matrix(m / px, n / py, CC_block);
                printf("**\n\n");
                fflush(stdout);
                passed_test = 1;
                sleep(1);
            }
        }
        MPI_Barrier(MPI_COMM_WORLD); /* Keep all processes synchronized */
    } /* proc */
#else
    /* Each process verifies its C_block in parallel */
    if (verify_matrix_bool(m / px, n / py, C_block, CC_block) == false) {
        passed_test = 1;
    }
#endif

    /* Free A_block, B_block, C_block, and CC_block */
    free(A_block);
    free(B_block);
    free(C_block);
    free(CC_block);

    /*
     * passed_test == 0 if the process PASSED the test
     * passed_test == 1 if the process FAILED the test
     *
     * Therefore an MPI_Reduce of passed_test counts the
     * number of processes that failed.
     *
     * After the MPI_Reduce/MPI_Bcast, group_passed == 0
     * means every process passed.
     */
    MPI_Reduce(&passed_test, &group_passed, 1, MPI_INT, MPI_SUM, 0,
               MPI_COMM_WORLD);
    MPI_Bcast(&group_passed, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0 && group_passed == 0) {
        printf("random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d"
               "............PASSED\n", m, n, k, px, py, panel_size);
    }
    if (rank == 0 && group_passed != 0) {
        printf("random_matrix_test m=%d n=%d k=%d px=%d py=%d pb=%d"
               "............FAILED\n", m, n, k, px, py, panel_size);
    }

    /* group_passed == 0 means every process passed the test */
    return group_passed == 0;
}
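/* local_mm is called with BLAS-style arguments above
 * (local_mm(m, n, k, 1.0, A, m, B, k, 0.0, CC, m)), which suggests a
 * column-major C = alpha*A*B + beta*C kernel with explicit leading
 * dimensions. A minimal sketch under that assumption (the project's real
 * kernel is defined elsewhere): */
void local_mm(const int m, const int n, const int k,
              const double alpha,
              const double *A, const int lda,
              const double *B, const int ldb,
              const double beta,
              double *C, const int ldc)
{
    int row, col, kk;
    for (col = 0; col < n; col++) {
        for (row = 0; row < m; row++) {
            double dot = 0.0;
            for (kk = 0; kk < k; kk++)
                dot += A[row + kk * lda] * B[kk + col * ldb]; /* A[row][kk]*B[kk][col] */
            C[row + col * ldc] = alpha * dot + beta * C[row + col * ldc];
        }
    }
}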