unsigned int gol(unsigned char *grid, unsigned int dim_x, unsigned int dim_y, unsigned int time_steps) { // READ ME! Parallelize this function to work with MPI. It must work even with a single processor. // We expect you to use MPI_Scatterv, MPI_Gatherv, and MPI_Sendrecv to achieve this. // MPI_Scatterv/Gatherv are checked to equal np times, and MPI_Sendrecv is expected to equal 2 * np * timesteps // That is, top+bottom ghost cells * all processors must execute this command * Sendrecv executed every timestep. int np, rank, quo, rem; MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Allocate length and displacements array quo = (dim_y/np)*dim_x; rem = (dim_y%np)*dim_x; int *length = (int *) calloc(np, sizeof(int)); int *disps = (int *) calloc(np, sizeof(int)); // Fill corresponding arrays for(int i = 0; i < np - 1; i++) { disps[i + 1] = disps[i] + quo; length[i] = disps[i + 1] - disps[i]; } length[np - 1] = quo + rem; // Grids allocation unsigned char *loc_grid_in, *loc_grid_tmp, *loc_grid_out; loc_grid_in = (unsigned char *) calloc(sizeof(unsigned char), length[rank] + 2*dim_x); loc_grid_tmp = (unsigned char *) calloc(sizeof(unsigned char), length[rank] + 2*dim_x); if (loc_grid_tmp == NULL) exit(EXIT_FAILURE); // Distribute parts of grid to other processors MPI_Scatterv(grid, length, disps, MPI_UNSIGNED_CHAR, loc_grid_in + dim_x, length[rank], MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); loc_grid_out = loc_grid_tmp; int loc_dim_y = length[rank]/dim_x; int frw = (rank + 1 + np) % np; int backw = (rank - 1 + np) % np; for (int t = 0; t < time_steps; ++t) { // Forward sendrecv MPI_Sendrecv(loc_grid_in + length[rank], dim_x, MPI_UNSIGNED_CHAR, frw, 1, loc_grid_in, dim_x, MPI_UNSIGNED_CHAR, backw, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Backward sendrecv MPI_Sendrecv(loc_grid_in + dim_x, dim_x, MPI_UNSIGNED_CHAR, backw, 0, loc_grid_in + dim_x + length[rank], dim_x, MPI_UNSIGNED_CHAR, frw, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); for (int y = 1; y < 1 + loc_dim_y; ++y) { for (int x = 0; x < dim_x; ++x) { evolve(loc_grid_in, loc_grid_out, dim_x, loc_dim_y + 2, x, y); } } swap((void**)&loc_grid_in, (void**)&loc_grid_out); } MPI_Gatherv(loc_grid_in + dim_x, length[rank], MPI_UNSIGNED_CHAR, grid, length, disps, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD); free(loc_grid_in); free(loc_grid_out); free(disps); free(length); if (rank == 0) return cells_alive(grid, dim_x, dim_y); else return 0; }
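/* The game-of-life kernel above relies on helpers that are not shown here (evolve, cells_alive,
 * and a pointer-swapping swap((void**)&a, (void**)&b)). As a minimal sketch under that
 * assumption, the swap helper it calls at the end of every time step could look like the
 * following; the name and signature are inferred from the call site above, not from any header
 * shown in this collection. */
static void swap(void **a, void **b)
{
    void *tmp = *a;  /* exchange the two buffer pointers so the freshly computed */
    *a = *b;         /* generation becomes the input of the next time step       */
    *b = tmp;
}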
main() { printf("Program start\n"); FILE *fp; clock_t begin,end; double time_spent; int NPROC, rank, root, colindex, link,i,j=0, k, col,colmatch=0,localsum=0; int newlines = 0, linenum = 5; char ch; char line[21]; double * val = (double*)calloc(EDGES, sizeof(double)); // double val[EDGES]; int * rowind =(int*)calloc(EDGES, sizeof(int)); // int rowind[EDGES]; int *sendcnts; int *displs; int * colptr = (int*)calloc(NODES+1, sizeof(int)); //missing values will be initialized to zero? no.of cols+1 int co, index; //for normalizing the array of non zero elements double * pr = (double*)malloc(NODES*sizeof(double));; //malloc and initialize to 0.25 double * prnew = (double*)calloc(NODES, sizeof(double)); double * damp1 = (double*)malloc(NODES*sizeof(double)); //malloc and initialize to 0.85 double * damp2 = (double*)malloc(NODES*sizeof(double)); //malloc and initialize to 0.15/NODES double * diff = (double*)calloc(NODES, sizeof(double)); double * sum = (double*)calloc(NODES, sizeof(double)); //this is for the column vectors double * rec_val = (double*)malloc(100000*sizeof(double)); double * rec_pr = (double*)malloc(100000*sizeof(double)); // double rec_val[4000], rec_pr[4000]; //these receive the scattered vector parts in the processes double err = 0.00001; double norm, norm_sq; int * readsum = (int*)calloc(NODES, sizeof(int)); int rec_col; int * rec_row = (int*)malloc(100000*sizeof(int)); MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &NPROC); printf("Number of processes %d\n", NPROC); sendcnts= malloc(sizeof(int)*NPROC); displs = malloc(sizeof(int)*NPROC); int * pcols = (int*)malloc(NPROC*sizeof(int)); //No.of columns to each processor // int pcols[NPROC]; int * displs_pr = (int*)malloc(NPROC*sizeof(int));// displacement for pagerank vector scatter for(i=0; i<NODES;i++) { pr[i] = 0.25; damp1[i] = 0.85; damp2[i] = 0.15/NODES; } printf("initialization complete\n"); printf(" I am rank %d\n", rank); //1. Populate val, rowind, colptr //2. Calculate number of columns to each process //3. Number of pagerank vector elements to each process //4. Distribution of non zero elements to corresponding processes fp = fopen("data1.dat", "r"); while((ch=getc(fp)) != EOF) { if(ch == '\n') { newlines += 1; if(newlines == linenum - 1) { break; } } } for(i = 0; i<EDGES; i++) { fscanf(fp, "%d %d", &colindex, &link); //Uncomment the next two lines if node numbers does not start with zero colindex = colindex - 1; link = link - 1; rowind[i] = link; if(colmatch==colindex) { localsum += 1; } else { readsum[j] = localsum; colptr[j+1] = colptr[j] + localsum; //index of val where new column starts localsum = 1; //new localsum j += 1; colmatch = colindex; //new column } val[i] = 1.0; } readsum[j] = localsum; //for the last column colptr[j+1]= EDGES; //number of non zeros in the matrix fclose(fp); index = 0; for(i = 0; i<NODES; i++) { // This is to normalize the array of non zeros co = readsum[i]; for(j = index; j < index+co; j++) { val[j] = val[j]/co; } index += co; } printf("val, rowind and colptr have been populated\n"); // val, rowind, colptr calculation complete... all the above should // go to mpi file.. 
// Calculate number of columns to each process for(i=0; i<NPROC; i++) { if(i==0) { pcols[i] = NODES/NPROC + NODES%NPROC; displs_pr[i] = 0; }else { pcols[i] = NODES/NPROC; displs_pr[i] = pcols[i-1] + displs_pr[i-1]; } } // Calculating sendcnts and displs j = 0; for(i=0; i<NPROC; i++) { j = j + pcols[i]; k = j - pcols[i]; sendcnts[i] = colptr[j] - colptr[k]; if (i==0) { displs[i] = 0; }else displs[i] = sendcnts[i-1] + displs[i-1]; } if(rank==MASTER) { printf("This is MASTER\n"); printf("\nsendcnts\n"); for(i=0;i<NPROC;i++) { printf("%d\t", sendcnts[i]); } printf("\ndispls\n"); for(i=0;i<NPROC;i++) { printf("%d\t", displs[i]); } printf("\nval\n"); // for(i=0;i<EDGES; i++) { // printf("%f\t", val[i]); // } } // double val[] = {0.25,0.25,0.25,0.25,0.5,0.5,1.0,0.5,0.5}; // int sendcnts[] = {6,3}; // int displs[] = {0,6}; MPI_Scatterv(val, sendcnts, displs, MPI_DOUBLE, rec_val, sendcnts[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);//non-zero elements printf("first scatterv completed\n"); MPI_Scatter(pcols, 1, MPI_INT, &rec_col, 1, MPI_INT, 0, MPI_COMM_WORLD);//number of columns in each processor printf("scatter completed\n"); MPI_Scatterv(rowind, sendcnts, displs, MPI_INT, rec_row, sendcnts[rank], MPI_INT, 0, MPI_COMM_WORLD);//rowindices printf("second scatterv completed\n"); double *vec[rec_col];// Initializing vector columns for(i=0; i<rec_col; i++) { vec[i] = (double *)calloc(NODES,sizeof(double)); } k=0; for(j=0;j<rec_col;j++) {// Splitting rec_val to columnvectors for(i=0;i<NODES;i++) { if(i==rec_row[k]) { vec[j][i] = rec_val[k];//these vectors don't change with iterations k+=1; } } } begin = clock(); do //Only MASTER contains updated prnew and pr.. each process has its own sum { // norm is calculated by MASTER but is distributed to all for loop continuity memset(sum, 0, NODES*sizeof(double)); if(rank == MASTER) { memset(prnew, 0, NODES*sizeof(double)); } norm = 0.0; //Scatter and multiply MPI_Scatterv(pr, pcols, displs_pr, MPI_DOUBLE, rec_pr, pcols[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD); for(i=0;i<NODES;i++) { for(j=0;j<rec_col;j++) { sum[i] += vec[j][i]*rec_pr[j];// sum of column multiplications } } MPI_Reduce(sum, prnew, NODES, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);//vector sums into master // Normalizing if(rank == MASTER) { for(i=0;i<NODES;i++) { prnew[i]= prnew[i]*damp1[i] + damp2[i]; } norm_sq = 0.0; for(i=0; i<NODES; i++) {// for norm calculation diff[i] = prnew[i] - pr[i]; norm_sq += diff[i]*diff[i]; pr[i] = prnew[i]; } norm = sqrt(norm_sq); } MPI_Bcast(&norm, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD); //if(rank==MASTER) { //printf("Reduced page rank vector:\n"); //for(i=0;i<NODES;i++) { // printf("%f\t", prnew[i]); //} }while(norm>err); end = clock(); time_spent = (double)(end-begin)/CLOCKS_PER_SEC; if(rank==MASTER) { printf("\npagerank vector first ten elements\n"); for(i = 0; i<10; i++) { printf("%f\t", prnew[i]); } printf("\n"); printf("Time taken for power iteration solution %fseconds\n",time_spent); } MPI_Finalize(); }
int main (int argc, char ** argv) { int taskid, ntasks; int xsize, ysize, colmax; pixel src[MAX_PIXELS]; double w[MAX_RAD]; struct timespec stime, etime; struct timespec tstime, tetime; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &taskid); MPI_Comm_size(MPI_COMM_WORLD, &ntasks); // Create a custom MPI datatype for pixel pixel item; MPI_Datatype pixel_mpi; MPI_Datatype type[3] = { MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR }; int blocklen[] = { 1, 1, 1 }; MPI_Aint start, disp[3]; MPI_Address( &item, &start ); MPI_Address( &item.r, &disp[0] ); MPI_Address( &item.g, &disp[1] ); MPI_Address( &item.b, &disp[2] ); disp[0] -= start; disp[1] -= start; disp[2] -= start; MPI_Type_struct(3, blocklen, disp, type, &pixel_mpi); MPI_Type_commit(&pixel_mpi); int buffsize, radius, startY, endY; /* Take care of the arguments */ if (argc != 4) { fprintf(stderr, "Usage: %s radius infile outfile\n", argv[0]); exit(1); } radius = atoi(argv[1]); if((radius > MAX_RAD) || (radius < 1)) { fprintf(stderr, "Radius (%d) must be greater than zero and less then %d\n", radius, MAX_RAD); exit(1); } if (taskid == ROOT) { /* read file */ if(read_ppm (argv[2], &xsize, &ysize, &colmax, (char *) src) != 0) exit(1); if (colmax > 255) { fprintf(stderr, "Too large maximum color-component value\n"); exit(1); } /* filter */ printf("Has read the image, generating coefficients\n"); get_gauss_weights(radius, w); } // Broadcast the gaussian weight vector MPI_Bcast(w, MAX_RAD, MPI_DOUBLE, ROOT, MPI_COMM_WORLD); // Broadcast image dimensions MPI_Bcast(&xsize, 1, MPI_INT, ROOT, MPI_COMM_WORLD); MPI_Bcast(&ysize, 1, MPI_INT, ROOT, MPI_COMM_WORLD); // Calculate chunk size buffsize = ceil((float)ysize / (float)ntasks) * xsize; pixel recvbuff[MAX_PIXELS]; int sendcnts[ntasks], displs[ntasks], result_write_starts[ntasks], recievecounts[ntasks]; int i; // Generate sendcount and displacement vectors for Scatterv for (i = 0; i < ntasks; i++) { // Send enought neighbors to make it possible to also calculate // blur in the edges of the chunk sendcnts[i] = buffsize + 2 * radius * xsize; displs[i] = max(0, i * buffsize); } clock_gettime(CLOCK_REALTIME, &tstime); // Send the image in chunks to all nodes MPI_Scatterv(src, sendcnts, displs, pixel_mpi, recvbuff, buffsize + 2 * radius * xsize, pixel_mpi, ROOT, MPI_COMM_WORLD); clock_gettime(CLOCK_REALTIME, &stime); // Run the filter on the recieved chunk blurfilter(xsize, (ysize / ntasks) + 2 * radius, recvbuff, radius, w, taskid); clock_gettime(CLOCK_REALTIME, &etime); printf("Filtering at %i took: %g secs\n", taskid, (etime.tv_sec - stime.tv_sec) + 1e-9*(etime.tv_nsec - stime.tv_nsec)); // Generate sendcount and displacement vectors for Scatterv for (i = 0; i < ntasks; i++) { result_write_starts[i] = i * buffsize + xsize * radius; // Only send as much of the chunk that is really useful data recievecounts[i] = buffsize; } // Start writing from the beginning of the buffer if root result_write_starts[0] = 0; // Since the root node has no overlap in the beginning, we need to // send a little bit more from that node than from the rest. 
recievecounts[0] = buffsize + xsize * radius; pixel* result_read_start; if(taskid==ROOT) { // Root-node has no duplicated data in the beginning result_read_start = recvbuff; } else { // Jump over the duplicated data in the beginning of each chunk result_read_start = recvbuff + xsize * radius; } MPI_Gatherv(result_read_start, recievecounts[taskid], pixel_mpi, src, recievecounts, result_write_starts, pixel_mpi, ROOT, MPI_COMM_WORLD); clock_gettime(CLOCK_REALTIME, &tetime); MPI_Finalize(); /* write result */ if (taskid == ROOT) { printf("Everything took: %g secs\n", (tetime.tv_sec - tstime.tv_sec) + 1e-9*(tetime.tv_nsec - tstime.tv_nsec)); printf("Writing output file\n"); if(write_ppm (argv[3], xsize, ysize, (char *)src) != 0) exit(1); } return(0); }
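/* The datatype construction in the blur-filter program above uses the MPI-1 calls MPI_Address
 * and MPI_Type_struct, which were deprecated in MPI-2 and removed in MPI-3. A minimal sketch of
 * the same pixel type built with the current API, assuming the same struct layout with
 * unsigned char r, g, b fields as implied above: */
pixel item;
MPI_Datatype pixel_mpi;
MPI_Datatype types[3] = { MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR };
int blocklen[3] = { 1, 1, 1 };
MPI_Aint base, disp[3];
MPI_Get_address(&item, &base);
MPI_Get_address(&item.r, &disp[0]);
MPI_Get_address(&item.g, &disp[1]);
MPI_Get_address(&item.b, &disp[2]);
for (int k = 0; k < 3; k++)
    disp[k] = MPI_Aint_diff(disp[k], base);  /* offsets relative to the struct start */
MPI_Type_create_struct(3, blocklen, disp, types, &pixel_mpi);
MPI_Type_commit(&pixel_mpi);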
/* Gather or scatter the global base array between processes. * NB: this is a collective operation. * * @scatter If true we scatter else we gather * @global_ary Global base array */ static void comm_gather_scatter(int scatter, bh_base *global_ary) { bh_error err; bh_base *local_ary = array_get_local(global_ary); bh_intp totalsize = global_ary->nelem; if(totalsize <= 0) return; //Find the local size for all processes int sendcnts[pgrid_worldsize], displs[pgrid_worldsize]; { bh_intp s = totalsize / pgrid_worldsize;//local size for all but the last process s *= bh_type_size(global_ary->type); for(int i=0; i<pgrid_worldsize; ++i) { sendcnts[i] = s; displs[i] = s * i; } //The last process gets the rest sendcnts[pgrid_worldsize-1] += totalsize % pgrid_worldsize * bh_type_size(global_ary->type); } int e; if(scatter) { //The slave-processes may need to allocate memory if(sendcnts[pgrid_myrank] > 0 && local_ary->data == NULL) { if((err = bh_data_malloc(local_ary)) != BH_SUCCESS) EXCEPT_OUT_OF_MEMORY(); } //The master-process MUST have allocated memory already assert(pgrid_myrank != 0 || global_ary->data != NULL); //Scatter from master to slaves e = MPI_Scatterv(global_ary->data, sendcnts, displs, MPI_BYTE, local_ary->data, sendcnts[pgrid_myrank], MPI_BYTE, 0, MPI_COMM_WORLD); } else { //Lets make sure that the 'local_ary' is updated batch_schedule_inst_on_base(BH_SYNC, local_ary); batch_flush(); //The master-processes may need to allocate memory if(pgrid_myrank == 0 && global_ary->data == NULL) { if((err = bh_data_malloc(global_ary)) != BH_SUCCESS) EXCEPT_OUT_OF_MEMORY(); } //We will always allocate the local array when gathering because //only the last process knows if the array has been initiated. if((err = bh_data_malloc(local_ary)) != BH_SUCCESS) EXCEPT_OUT_OF_MEMORY(); assert(sendcnts[pgrid_myrank] == 0 || local_ary->data != NULL); //Gather from the slaves to the master e = MPI_Gatherv(local_ary->data, sendcnts[pgrid_myrank], MPI_BYTE, global_ary->data, sendcnts, displs, MPI_BYTE, 0, MPI_COMM_WORLD); } if(e != MPI_SUCCESS) EXCEPT_MPI(e); }
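/* A self-contained sketch of the partitioning rule used by comm_gather_scatter above: every
 * rank gets floor(nelem / worldsize) elements (scaled to bytes) and the last rank additionally
 * takes the remainder. The function name and parameters here are local to this sketch, not part
 * of the surrounding code base. */
#include <stddef.h>

static void even_counts_last_gets_rest(size_t nelem, size_t elem_size, int worldsize,
                                       int *sendcnts, int *displs)
{
    size_t share = (nelem / worldsize) * elem_size;   /* equal share in bytes */
    for (int i = 0; i < worldsize; ++i) {
        sendcnts[i] = (int) share;
        displs[i]   = (int) (share * i);
    }
    /* the remainder goes to the last rank */
    sendcnts[worldsize - 1] += (int) ((nelem % worldsize) * elem_size);
}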
int main(int argc, char *argv[]){ int my_rank, procs, tag=0; uint64_t nodes = pow(2,SCALE); uint64_t edges = nodes*EDGEFACTOR; uint64_t root = ROOT; MPI_Status status; MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &my_rank); MPI_Comm_size (MPI_COMM_WORLD, &procs); //SHOULD BE POWER OF TWO uint64_t *startVertex = NULL; uint64_t *endVertex = NULL; /* MUST BE INT BECAUSE OF MPI RESTRICTION */ int *edgelist_send_counts = NULL; int *edgelist_send_displs = NULL; uint64_t *startVertex_recvbuf = NULL; uint64_t *endVertex_recvbuf = NULL; uint64_t *index_of_node = NULL; uint64_t *level = (uint64_t *) calloc(nodes / BITS, sizeof(uint64_t)); int edgelist_counts_recvbuf = 0; if (my_rank == 0){ startVertex = (uint64_t *) calloc(edges, I64_BYTES); endVertex = (uint64_t *) calloc(edges, I64_BYTES); edgelist_send_counts = (int *) calloc(procs, sizeof(int)); edgelist_send_displs = (int *) calloc(procs, sizeof(int)); read_graph(SCALE, EDGEFACTOR, startVertex, endVertex); double time = mytime(); //SORTING THE EDGE LIST sort(startVertex, endVertex, 0, edges-1); //FINDING OUT THE BOUNDS OF THE EDGE LIST FOR EACH PROC int j; int last_node_number = 0; int core_count = 0; for (j = 0; j < procs; j++){ last_node_number = nodes / procs * (j+1) - 1; core_count = (edges / procs * (j+1)) - 1; if (j < procs -1){ while (startVertex[core_count] <= last_node_number) { core_count++; } while (startVertex[core_count] > last_node_number){ core_count--; } if (j){ edgelist_send_counts[j] = core_count - edgelist_send_displs[j] + 1; }else{ edgelist_send_counts[j] = core_count + 1; } edgelist_send_displs[j+1] = core_count + 1; }else{ edgelist_send_displs[0] = 0; edgelist_send_counts[j] = edges - edgelist_send_displs[j]; } } MPI_Scatter((void *) edgelist_send_counts, 1, MPI_INT, &edgelist_counts_recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); startVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES); endVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES); MPI_Scatterv((void *) startVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) startVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD); MPI_Scatterv((void *) endVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) endVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD); index_of_node = create_buffer_from_edgelist(startVertex_recvbuf, endVertex_recvbuf, nodes / procs, edgelist_counts_recvbuf, my_rank); //SET ROOT LEVEL level[(ROOT/BITS)] = level[(ROOT/BITS)] | (uint64_t) pow(2,(ROOT % BITS)); //SCATTER LEVEL BUFFER MPI_Bcast((void *)level, nodes / BITS, MPI_UINT64_T, 0, MPI_COMM_WORLD); /*for (i = 0; i < index_of_node[(nodes/procs)]; i++){ printf("%llu = %llu\n", (unsigned long long) buffer_recvbuf[i], (unsigned long long) startVertex_recvbuf[i]); } for (i = 0; i < nodes / procs; i++){ printf("%llu = %llu\n", (unsigned long long) count_edges_per_node_recvbuf[i], (unsigned long long) index_of_node[i]); }*/ //BFS time = mytime() - time; printf("Time for reading, generating edge buffer and scattering: %f\n", time/1000000); time = mytime(); bfs(level, startVertex_recvbuf, index_of_node[nodes/procs], index_of_node, my_rank, procs); time = mytime() - time; printf("Time for bfs searching: %f\n", time/1000000); free(edgelist_send_counts); free(edgelist_send_displs); free(startVertex); free(endVertex); }else{ MPI_Scatter((void *) edgelist_send_counts, 1, MPI_INT, &edgelist_counts_recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD); startVertex_recvbuf = (uint64_t *) 
calloc(edgelist_counts_recvbuf, I64_BYTES); endVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES); MPI_Scatterv((void *) startVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) startVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD); MPI_Scatterv((void *) endVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) endVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD); index_of_node = create_buffer_from_edgelist(startVertex_recvbuf, endVertex_recvbuf, nodes / procs, edgelist_counts_recvbuf, my_rank); // GET THE FIRST LEVEL MPI_Bcast((void *)level, nodes / BITS, MPI_UINT64_T, 0, MPI_COMM_WORLD); bfs(level, startVertex_recvbuf, index_of_node[nodes/procs], index_of_node, my_rank, procs); } free(level); free(startVertex_recvbuf); free(endVertex_recvbuf); free(index_of_node); MPI_Finalize (); return 0; }
void IMB_scatterv(struct comm_info* c_info, int size, struct iter_schedule* ITERATIONS, MODES RUN_MODE, double* time) /* MPI-1 benchmark kernel Benchmarks MPI_Scatterv Input variables: -c_info (type struct comm_info*) Collection of all base data for MPI; see [1] for more information -size (type int) Basic message size in bytes -ITERATIONS (type struct iter_schedule *) Repetition scheduling -RUN_MODE (type MODES) (only MPI-2 case: see [1]) Output variables: -time (type double*) Timing result per sample */ { double t1, t2; int i; Type_Size s_size,r_size; int s_num, r_num; #ifdef CHECK defect=0.; #endif ierr = 0; /* GET SIZE OF DATA TYPE */ MPI_Type_size(c_info->s_data_type,&s_size); MPI_Type_size(c_info->r_data_type,&r_size); if ((s_size!=0) && (r_size!=0)) { s_num=size/s_size; r_num=size/r_size; } /* INITIALIZATION OF DISPLACEMENT and RECEIVE COUNTS */ for (i=0;i<c_info->num_procs ;i++) { c_info->sdispl[i] = s_num*i; c_info->sndcnt[i] = s_num; } if(c_info->rank!=-1) { for(i=0; i<N_BARR; i++) MPI_Barrier(c_info->communicator); t1 = MPI_Wtime(); for(i=0;i<ITERATIONS->n_sample;i++) { ierr = MPI_Scatterv((char*)c_info->s_buffer+i%ITERATIONS->s_cache_iter*ITERATIONS->s_offs, c_info->sndcnt,c_info->sdispl, c_info->s_data_type, (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs, // root = round robin r_num, c_info->r_data_type, i%c_info->num_procs, c_info->communicator); MPI_ERRHAND(ierr); CHK_DIFF("Scatterv",c_info, (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs, c_info->sdispl[c_info->rank], size, size, 1, put, 0, ITERATIONS->n_sample, i, i%c_info->num_procs, &defect); } t2 = MPI_Wtime(); *time=(t2 - t1)/ITERATIONS->n_sample; } else { *time = 0.; } }
/* Guassian Elimination algorithm using MPI */ void gaussElimination() { MPI_Status status; MPI_Request request; int row, col, i, norm; float multiplier; /* Array with the row size and number of rows that each processor will handle */ int * first_row_A_array = (int*) malloc ( p * sizeof(int) ); int * n_of_rows_A_array = (int*) malloc ( p * sizeof(int) ); int * first_row_B_array = (int*) malloc ( p * sizeof(int) ); int * n_of_rows_B_array = (int*) malloc ( p * sizeof(int) ); for ( i = 0; i < p; i++ ) { first_row_A_array[i] = 0; n_of_rows_A_array[i] = 0; first_row_B_array[i] = 0; n_of_rows_B_array[i] = 0; } /* Main loop. After every iteration, a new column will have all 0 values down the [norm] index */ for (norm = 0; norm < N-1; norm++) { /* --------------------------------------- */ /* Broadcasting of common values */ /* -------------------------------------- */ /* Broadcast the A[norm] row and B[norm], important values of this iteration */ MPI_Bcast( &A[ N*norm ], N, MPI_FLOAT, SOURCE, MPI_COMM_WORLD ); MPI_Bcast( &B[norm], 1, MPI_FLOAT, SOURCE, MPI_COMM_WORLD ); /* --------------------------------------- */ /* Calculation of number of rows to operate */ /* -------------------------------------- */ /* subset of rows of this iteration */ int subset = N - 1 - norm; /* number that indicates the step as a float */ float step = ((float)subset ) / (p); /* First and last rows that this process will work into for this iteration */ int first_row = norm + 1 + ceil( step * (my_rank) ); int last_row = norm + 1 + floor( step * (my_rank+1) ); if ( last_row >= N ) last_row = N-1; int number_of_rows = last_row - first_row +1; /*printf("\nProcess number %d of %d says in iteration %d that a=%d, b=%d and n=%d\n", my_rank+1, p, norm+1,first_row,last_row,number_of_rows) ;*/ /* --------------------------------------- */ /* Send data from process 0 to others */ /* -------------------------------------- */ if ( my_rank == SOURCE ) { for ( i = 1; i < p; i++ ) { /* We send to each process the amount of data that they are going to handle */ int first_row_rmte = norm + 1 + ceil( step * (i) ); int last_row_rmte = norm + 1 + floor( step * (i+1) ); if( last_row_rmte >= N ) last_row_rmte = N -1; int number_of_rows_rmte = last_row_rmte - first_row_rmte +1; /* In case this process isn't assigned any task, continue. 
This happens when there are more processors than rows */ //if( number_of_rows_rmte < 1 || first_row_rmte >= N ) continue; if ( number_of_rows_rmte < 0 ) number_of_rows_rmte = 0; if ( first_row_rmte >= N ) { number_of_rows_rmte = 0; first_row_rmte = N-1; }; first_row_A_array[i] = first_row_rmte * N; first_row_B_array[i] = first_row_rmte; n_of_rows_A_array[i] = number_of_rows_rmte * N; n_of_rows_B_array[i] = number_of_rows_rmte ; //MPI_Isend( &A[first_row_rmte * N], N * number_of_rows_rmte, MPI_FLOAT, i,0, MPI_COMM_WORLD, &request); //MPI_Isend( &B[first_row_rmte], number_of_rows_rmte, MPI_FLOAT, i,0, MPI_COMM_WORLD, &request); } } /* Receiver side */ /* else { if ( number_of_rows > 0 && first_row < N) { //MPI_Recv( &A[first_row * N], N * number_of_rows, MPI_FLOAT, SOURCE, 0, MPI_COMM_WORLD, &status); //MPI_Recv( &B[first_row], number_of_rows, MPI_FLOAT, SOURCE, 0, MPI_COMM_WORLD, &status); } }*/ MPI_Scatterv( &A[0], // send buffer n_of_rows_A_array, // array with number of elements in each chunk first_row_A_array, // array with pointers to initial element of each chunk MPI_FLOAT, // type of elements to send &A[first_row * N], // receive buffer N * number_of_rows, // number of elements to receive MPI_FLOAT, // type of elements to receive SOURCE, // who sends MPI_COMM_WORLD ); MPI_Scatterv( &B[0], n_of_rows_B_array, first_row_B_array, MPI_FLOAT, &B[first_row], number_of_rows, MPI_FLOAT, SOURCE, MPI_COMM_WORLD ); /*printf("\nProcess %d: Iteration number %d of %d\n", my_rank, norm+1, N-1); print_A();*/ /* --------------------------------------- */ /* Gaussian elimination */ /* The arrays only have the needed values */ /* -------------------------------------- */ if ( number_of_rows > 0 && first_row < N) { /* Similar code than in the sequential case */ for (row = first_row; row <= last_row; row++) { multiplier = A[N*row + norm] / A[norm + N*norm]; for (col = norm; col < N; col++) { A[col+N*row] -= A[N*norm + col] * multiplier; } B[row] -= B[norm] * multiplier; } } /* --------------------------------------- */ /* Send back the results */ /* -------------------------------------- */ /* Sender side */ if ( my_rank != SOURCE ) { if ( number_of_rows > 0 && first_row < N) { MPI_Isend( &A[first_row * N], N * number_of_rows, MPI_FLOAT, SOURCE,0, MPI_COMM_WORLD, &request); MPI_Isend( &B[first_row], number_of_rows, MPI_FLOAT, SOURCE,0, MPI_COMM_WORLD, &request); } } /* Receiver side */ else { for ( i = 1; i < p; i++ ) { // In case this process isn't assigned any task, continue. This happens when there are more processors than rows if( n_of_rows_B_array[i] < 1 || first_row_B_array[i] >= N) continue; MPI_Recv( &A[ first_row_A_array[i] ], n_of_rows_A_array[i] , MPI_FLOAT, i,0, MPI_COMM_WORLD, &status ); MPI_Recv( &B[ first_row_B_array[i] ], n_of_rows_B_array[i] , MPI_FLOAT, i,0, MPI_COMM_WORLD, &status ); } } /* MPI_Gatherv( &A[first_row * N], // send buffer N * number_of_rows, // number of elements to send MPI_FLOAT, // type of elements to send &A[0], // receive buffer n_of_rows_A_array, // array with number of elements in each chunk first_row_A_array, // array with pointers to initial element of each chunk, in the reception buffer MPI_FLOAT, // type of elements to receive SOURCE, // who receives MPI_COMM_WORLD ); MPI_Gatherv( &B[first_row], number_of_rows, MPI_FLOAT, &B[0], n_of_rows_B_array, first_row_B_array, MPI_FLOAT, SOURCE, MPI_COMM_WORLD ); */ } }
int main(int argc, char *argv[]) {
    int m, n, c, iters;
    int my_m, my_n, my_rank, num_procs, recv_count, my_recv_count, block_size, smallest_block_size;
    float kappa;
    image u, u_bar;
    unsigned char *image_chars, *my_image_chars, *new_image_chars, *my_new_image_chars;
    char *input_jpeg_filename, *output_jpeg_filename;
    my_rank = 0;
    char *kappa_str;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    int displs[num_procs], recvDispls[num_procs], sendCounts[num_procs], recvCounts[num_procs];
    int i, my_m_rest;
    /*
     * read from command line: kappa, iters, input_jpeg_filename, output_jpeg_filename
     */
    input_jpeg_filename = argv[1];  // correct: already a char *
    output_jpeg_filename = argv[2]; // correct: already a char *
    kappa_str = argv[3];            // must be converted to double
    iters = atoi(argv[4]);          // must be converted to int
    //printf("iters: %d\n", iters);
    kappa = 0.01; //TODO: fix so that kappa can be read from the command line
    kappa = atof(kappa_str);
    if (my_rank == 0) {
        import_JPEG_file(input_jpeg_filename, &image_chars, &m, &n, &c);
    }
    /////////////////////////////////////////////////////////////////
    // Broadcast the image size from root (=0) to all other processes.
    /////////////////////////////////////////////////////////////////
    MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    /*
     * divide the m x n pixels evenly among the MPI processes
     */
    my_n = n;                     // this is correct
    my_m = (m-2)/num_procs;       // without ghost points
    my_m_rest = (m-2)%num_procs;
    smallest_block_size = my_m*my_n;
    if (my_rank < my_m_rest) {
        my_m += 1;
    }
    printf("my_m: %d\n", my_m);
    block_size = my_m*n;
    /////////////////////////////////////////////////////////////////////////
    // the first my_m_rest processes get one extra row when (m-2) is not
    // divisible by num_procs
    /////////////////////////////////////////////////////////////////////////
    // if(my_rank==num_procs-1){
    //     my_m = my_m + (m-2)%num_procs;
    // }
    my_recv_count = my_m*my_n;
    /////////////////////////////////////////////////////
    // this is the picture divided into two processes.
// n--> // ----------------------- m // | | | // | 0 | v // ----------------------- // | | // | 1 | // ----------------------- /////////////////////////////////////////////////////// allocate_image(&u, my_m, my_n); allocate_image(&u_bar, my_m, my_n); my_image_chars = malloc((block_size+2*n)*(sizeof(int))); if(my_rank==0){ int last_displ=0; int current_block_size; for(i=0;i<my_m_rest;i++){ current_block_size = smallest_block_size + n; sendCounts[i] = current_block_size + 2*n; recvCounts[i] = current_block_size; displs[i] = current_block_size*i; recvDispls[i] = 0; //printf("sendCounts: %d\n", sendCounts[i]); printf("displ: %d\n",displs[i]/n); last_displ = displs[i]; } printf("rest: %d\n", my_m_rest); for(i=my_m_rest;i<num_procs;i++){ printf("%d\n",i); current_block_size = smallest_block_size; printf("%d\n", current_block_size); sendCounts[i] = current_block_size+2*n; recvCounts[i] = current_block_size; if(i==0){ displs[i] = 0; }else{ displs[i] = displs[i-1] + current_block_size; } recvDispls[i] = 0; //printf("sendCounts: %d\n", sendCounts[i]); printf("displ: %d\n", displs[i]/n); } } /* *each process asks process 0 for partitiones region *of image_chars and copy the values into u */ //MPI_Scatterv(image_chars, sendCounts, displs, MPI_CHAR, my_image_chars, recv_count, MPI_CHAR, 0, MPI_COMM_WORLD); //MPI_Scatter(&image_chars, my_m*my_n,MPI_CHAR, &my_image_chars, my_m*my_n, MPI_CHAR, 0,MPI_COMM_WORLD);//assume first that there will be no extra rows //MPI_Scatter(image_chars, block_size, MPI_CHAR, my_image_chars, block_size, MPI_CHAR, 0, MPI_COMM_WORLD); MPI_Scatterv(image_chars, sendCounts, displs, MPI_CHAR, my_image_chars, block_size+2*n, MPI_CHAR, 0, MPI_COMM_WORLD); int start = 0; convert_char_to_float(my_image_chars, &u,my_m+2, my_n,start); //printf("%f", kappa); iso_diffusion_denoising(&u, &u_bar, kappa, iters); /* *each process sends its resulting content of u_bar to process 0 *process 0 receives from each process incoming vaules and *copy them into the designated region of image_chars */ //convert_float_to_char(&image_chars,&u,my_m, my_n,start); int x,y, pict_number,value; for(x=0;x<my_m+2;x++){ for(y=0;y<my_n;y++){ pict_number = x*n + y; value = (int)(u.image_data[x][y]); my_image_chars[pict_number] = (unsigned char) value; } } //MPI_Gather(my_image_chars, block_size, MPI_CHAR, image_chars, block_size, MPI_CHAR, 0,MPI_COMM_WORLD); //MPI_Gatherv(my_image_chars, block_size, MPI_CHAR, image_chars,recvCounts, displs, MPI_CHAR,0, MPI_COMM_WORLD); //MPI_Gatherv(my_image_chars, block_size+2*n, MPI_CHAR, image_chars, sendCounts, displs, MPI_CHAR, 0,MPI_COMM_WORLD); MPI_Send(my_image_chars,block_size+2*n, MPI_CHAR, 0,0, MPI_COMM_WORLD); int k,p; if(my_rank == 0){ //receive the computed my_image_chars from all processes my_new_image_chars = malloc(block_size*sizeof(int)); new_image_chars = malloc(n*m*sizeof(int)); for(i=0;i<n*m;i++){ new_image_chars[i] = 0; } for(i=0;i<num_procs;i++){ MPI_Recv(my_new_image_chars,sendCounts[i], MPI_CHAR,i,0,MPI_COMM_WORLD, MPI_STATUS_IGNORE); start = displs[i];//i*(sendCounts[i]-2*n); for(k=0;k<sendCounts[i];k++){ new_image_chars[start + k]= my_new_image_chars[k]; } } export_JPEG_file(output_jpeg_filename, new_image_chars,m,n,c,75); } deallocate_image(&u); deallocate_image(&u_bar); //printf("Hello World!\n"); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int rank, size, myrow, mycol, nx, ny, stride, cnt, i, j, errs, errs_in_place; double *sendbuf, *recvbuf; MPI_Datatype vec, block, types[2]; MPI_Aint displs[2]; int *scdispls; int blens[2]; MPI_Comm comm2d; int dims[2], periods[2], coords[2], lcoords[2]; int *sendcounts; MTest_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); /* Get a 2-d decomposition of the processes */ dims[0] = 0; dims[1] = 0; MPI_Dims_create(size, 2, dims); periods[0] = 0; periods[1] = 0; MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d); MPI_Cart_get(comm2d, 2, dims, periods, coords); myrow = coords[0]; mycol = coords[1]; /* if (rank == 0) printf("Decomposition is [%d x %d]\n", dims[0], dims[1]); */ /* Get the size of the matrix */ nx = 10; ny = 8; stride = nx * dims[0]; recvbuf = (double *) malloc(nx * ny * sizeof(double)); if (!recvbuf) { MPI_Abort(MPI_COMM_WORLD, 1); } sendbuf = 0; if (myrow == 0 && mycol == 0) { sendbuf = (double *) malloc(nx * ny * size * sizeof(double)); if (!sendbuf) { MPI_Abort(MPI_COMM_WORLD, 1); } } sendcounts = (int *) malloc(size * sizeof(int)); scdispls = (int *) malloc(size * sizeof(int)); MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec); blens[0] = 1; blens[1] = 1; types[0] = vec; types[1] = MPI_UB; displs[0] = 0; displs[1] = nx * sizeof(double); MPI_Type_struct(2, blens, displs, types, &block); MPI_Type_free(&vec); MPI_Type_commit(&block); /* Set up the transfer */ cnt = 0; for (i = 0; i < dims[1]; i++) { for (j = 0; j < dims[0]; j++) { sendcounts[cnt] = 1; /* Using Cart_coords makes sure that ranks (used by * sendrecv) matches the cartesian coordinates (used to * set data in the matrix) */ MPI_Cart_coords(comm2d, cnt, 2, lcoords); scdispls[cnt++] = lcoords[0] + lcoords[1] * (dims[0] * ny); } } SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]); MPI_Scatterv(sendbuf, sendcounts, scdispls, block, recvbuf, nx * ny, MPI_DOUBLE, 0, comm2d); if ((errs = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], 0))) { fprintf(stdout, "Failed to transfer data\n"); } /* once more, but this time passing MPI_IN_PLACE for the root */ SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]); MPI_Scatterv(sendbuf, sendcounts, scdispls, block, (rank == 0 ? MPI_IN_PLACE : recvbuf), nx * ny, MPI_DOUBLE, 0, comm2d); errs_in_place = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], (rank == 0)); if (errs_in_place) { fprintf(stdout, "Failed to transfer data (MPI_IN_PLACE)\n"); } errs += errs_in_place; if (sendbuf) free(sendbuf); free(recvbuf); free(sendcounts); free(scdispls); MPI_Type_free(&block); MPI_Comm_free(&comm2d); MTest_Finalize(errs); return MTestReturnValue(errs); }
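/* The extent trick above (MPI_Type_vector plus an MPI_UB marker via MPI_Type_struct) is the
 * MPI-1 way of making consecutive scatter blocks start nx doubles apart; MPI_UB was removed in
 * MPI-3. A minimal sketch of the same "vector with shrunken extent" built with the current API,
 * reusing nx, ny and stride as defined above: */
MPI_Datatype vec, block;
MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
/* keep the data layout of vec but declare its extent to be only nx doubles, so displacements
 * counted in units of "block" advance by one column block at a time */
MPI_Type_create_resized(vec, 0, (MPI_Aint)(nx * sizeof(double)), &block);
MPI_Type_free(&vec);
MPI_Type_commit(&block);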
int main (int argc, char **argv) { FILE *fp; double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL; double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL; int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size; int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size; int rank, size, sqrt_size, matrices_a_b_dimensions[4]; MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator; MPI_Status status; // used to manage the cartesian grid int dimensions[2], periods[2], coordinates[2], remain_dims[2]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* For square mesh */ sqrt_size = (int)sqrt((double) size); if(sqrt_size * sqrt_size != size){ if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n"); MPI_Abort(MPI_COMM_WORLD, -1); } // create a 2D cartesian grid dimensions[0] = dimensions[1] = sqrt_size; periods[0] = periods[1] = 1; MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator); MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); //v COORDINATES imas shranjene koordinate procesa RANK // create a row communicator remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator); // create a column communicator remain_dims[0] = 1; remain_dims[1] = 0; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator); // getting matrices from files at rank 0 only // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test] if (rank == 0){ int row, column; if ((fp = fopen (argv[1], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]); A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *)); for (row = 0; row < matrices_a_b_dimensions[0]; row++){ A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double)); for (column = 0; column < matrices_a_b_dimensions[1]; column++) fscanf(fp, "%lf", &A[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, -1); } if((fp = fopen (argv[2], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]); B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *)); for(row = 0; row < matrices_a_b_dimensions[2]; row++){ B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *)); for(column = 0; column < matrices_a_b_dimensions[3]; column++) fscanf(fp, "%lf", &B[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // need to check that the multiplication is possible given dimensions // matrices_a_b_dimensions[0] = row size of A // matrices_a_b_dimensions[1] = column size of A // matrices_a_b_dimensions[2] = row size of B // matrices_a_b_dimensions[3] = column size of B if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){ if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // this implementation is limited to cases where thematrices can be partitioned perfectly if( matrices_a_b_dimensions[0] % sqrt_size != 0 || matrices_a_b_dimensions[1] % sqrt_size != 0 || matrices_a_b_dimensions[2] % sqrt_size != 0 
|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){ if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n" "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n", matrices_a_b_dimensions[0],matrices_a_b_dimensions[1], matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size ); MPI_Abort(MPI_COMM_WORLD, -1); } } // send dimensions to all peers //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% /*if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator); } } else { MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status); }*/ //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //has to be blocking, bcs it is used right afterwards... MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% A_rows = matrices_a_b_dimensions[0]; A_columns = matrices_a_b_dimensions[1]; B_rows = matrices_a_b_dimensions[2]; B_columns = matrices_a_b_dimensions[3]; // local metadata for A A_local_block_rows = A_rows / sqrt_size; A_local_block_columns = A_columns / sqrt_size; A_local_block_size = A_local_block_rows * A_local_block_columns; A_local_block = (double *) malloc (A_local_block_size * sizeof(double)); // local metadata for B B_local_block_rows = B_rows / sqrt_size; B_local_block_columns = B_columns / sqrt_size; B_local_block_size = B_local_block_rows * B_local_block_columns; B_local_block = (double *) malloc (B_local_block_size * sizeof(double)); // local metadata for C C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double)); // C needs to be initialized at 0 (accumulates partial dot-products) int i; for(i=0; i < A_local_block_rows * B_local_block_columns; i++){ C_local_block[i] = 0; } // full arrays only needed at root if(rank == 0){ A_array = (double *) malloc(sizeof(double) * A_rows * A_columns); B_array = (double *) malloc(sizeof(double) * B_rows * B_columns); C_array = (double *) malloc(sizeof(double) * A_rows * B_columns); // generate the 1D arrays of the matrices at root int row, column, i, j; for (i = 0; i < sqrt_size; i++){ for (j = 0; j < sqrt_size; j++){ for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < A_local_block_columns; column++){ A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = A[i * A_local_block_rows + row][j * A_local_block_columns + column]; } } for (row = 0; row < B_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = B[i * B_local_block_rows + row][j * B_local_block_columns + column]; } } } } // allocate output matrix C C = (double **) malloc(A_rows * sizeof(double *)); for(i=0; i<A_rows ;i++){ C[i] = (double *) malloc(B_columns * sizeof(double)); } } // send a block to each process //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% /*if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); } for(i = 0; i < A_local_block_size; i++){ A_local_block[i] = A_array[i]; } for(i = 0; i < 
B_local_block_size; i++){
			B_local_block[i] = B_array[i];
		}
	} else {
		MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
		MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*MPI_Scatter(A_array, A_local_block_size, //int send_count,
		MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// fix initial arrangements before the core algorithm starts - the point is that before the
	// computational part of the algorithm runs for the first time, the blocks already have to be shifted...
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*if(coordinates[0] != 0){
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
			(coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0,
			(coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
	}
	if(coordinates[1] != 0){
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
			(coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0,
			(coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// two independent scattervs one after another, so they can be non-blocking, but a wait is
	// needed right afterwards, since the data is then used...
	int displsA[size];
	int displsB[size];
	int localblsizA[size];
	int localblsizB[size];
	MPI_Request requests[2];
	MPI_Status statuses[2];
	for (int i=0; i<sqrt_size; i++){
		for (int j=0; j<sqrt_size; j++){
			// process (i,j) starts with A block (i, (i+j) mod sqrt_size) and B block ((i+j) mod sqrt_size, j)
			displsA[i*sqrt_size + j] = (i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size;
			displsB[i*sqrt_size + j] = (j + ((j+i)%sqrt_size)*sqrt_size)*B_local_block_size;
			localblsizA[i*sqrt_size+j] = A_local_block_size;
			localblsizB[i*sqrt_size+j] = B_local_block_size;
		}
	}
	MPI_Iscatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size,
		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[0]);
	MPI_Iscatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size,
		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[1]);
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// This initial realignment would not be needed if the data were written into A_array already
	// aligned at the start... a plain scatter would then be enough.
	// Isaias also said we can leave this in and just explain that it cannot be rewritten as a
	// collective, because it is an if statement and not all ranks participate;
	// the requirement for a collective is exactly that everyone participates!
	//
	// There is one more option: scatterv, with suitable displacements!
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// cannon's algorithm
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% this part is for SCATTER + GATHER
	int dispA[sqrt_size];
	int dispB[sqrt_size];
	int localsizesA[sqrt_size];
	int localsizesB[sqrt_size];
	// buffers for the ranks that gather all blocks of their row/column; they must stay in scope
	// for the gather/scatter calls inside the loop below
	double *A_rowarray = NULL;
	double *B_rowarray = NULL;
	if (coordinates[0]==0) {
		B_rowarray = (double *) malloc(sqrt_size * B_local_block_size * sizeof(double));
	}
	if (coordinates[1]==0){
		A_rowarray = (double *) malloc(sqrt_size * A_local_block_size * sizeof(double));
	}
	for (int i=0; i<sqrt_size; i++){
		dispA[i] = ((i+1)%sqrt_size)*A_local_block_size;
		dispB[i] = ((i+1)%sqrt_size)*B_local_block_size;
		localsizesA[i] = A_local_block_size;
		localsizesB[i] = B_local_block_size;
	}
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;
	MPI_Waitall(2, requests, statuses);
	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
						B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		//start = MPI_Wtime();
		// rotate blocks horizontally
		/*MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, //this could be done with MPI_Alltoallv and the in-place variant, but it would be inefficient - see the comments below!
			(coordinates[1] + sqrt_size - 1) % sqrt_size, 0,
			(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
			(coordinates[0] + sqrt_size - 1) % sqrt_size, 0,
			(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start; */
		// If you use sendrecv, each row/column does sqrt_size communications in total
		// (i.e. SQRT_SIZE sends + SQRT_SIZE receives);
		// if you used alltoall instead you would have SIZE of them (even though some blocks sent
		// would have size 0), which is also very inefficient (see the comments):
		/*This is allowed by the standard, but be warned that it is likely to perform poorly compared
		to what could be done with point-to-point or one-sided operations if most links are empty. ! ! ! ! ! ! ! ! */
		// You could also do this with gather+scatter, where one rank per row/column gathers all blocks
		// and sends them back shifted. But that would still mean
		// 2*SQRT_SIZE communications, and it would have to be blocking, since the data is used right afterwards.
		// It might sound good to do this because even though
		// you need the same amount of communication, the collectives are optimized, so the communication
		// alone should in this case take less time. However it's probably
		// not that big of a difference... still, we could try it?
		// An even better idea seems to be to figure out the pattern in which the blocks are shifted, and
		// only use the A_array to scatter it from rank 0
		// in the right order to all other ranks... This way we would all in all need SIZE communications
		// (rank 0 with everyone else), while with the previous
		// way we would all together need num_rows/columns*2*SQRT_SIZE, which is twice as many. However in
		// this last way, we would also need to compute the right
		// indices for the scatter every time?
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% SCATTER + GATHER
		// space is needed on the ranks that store all blocks of a row/column (allocated above);
		// root 0 is the rank with coordinate 0 in the respective row/column communicator
		start = MPI_Wtime();
		MPI_Gather(A_local_block, A_local_block_size, MPI_DOUBLE, A_rowarray, A_local_block_size,
			MPI_DOUBLE, 0, row_communicator);
		MPI_Gather(B_local_block, B_local_block_size, MPI_DOUBLE, B_rowarray, B_local_block_size,
			MPI_DOUBLE, 0, column_communicator);
		MPI_Scatterv(A_rowarray, localsizesA, dispA, MPI_DOUBLE, A_local_block, A_local_block_size,
			MPI_DOUBLE, 0, row_communicator);
		MPI_Scatterv(B_rowarray, localsizesB, dispB, MPI_DOUBLE, B_local_block, B_local_block_size,
			MPI_DOUBLE, 0, column_communicator);
		mpi_time += MPI_Wtime() - start;
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% SCATTER from origin
		start = MPI_Wtime(); // or should this be measured only below, after the loops?
		for (int i=0; i<sqrt_size-1; i++){
			for (int j=0; j<sqrt_size-1; j++){
				displsA[i*sqrt_size + j] += A_local_block_size;
				displsB[i*sqrt_size + j] += B_local_block_size*sqrt_size;
			}
		}
		for (int i=0; i<sqrt_size; i++){
			displsA[size - sqrt_size + i] -= A_local_block_size*(sqrt_size-1);
			displsB[size - sqrt_size + i] -= B_local_block_size*(sqrt_size-1)*sqrt_size;
		}
		MPI_Scatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size,
			MPI_DOUBLE, 0, cartesian_grid_communicator);
		MPI_Scatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size,
			MPI_DOUBLE, 0, cartesian_grid_communicator);
		mpi_time += MPI_Wtime() - start;
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	}
	// get C parts from other processes at rank 0
	/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	if(rank == 0) {
		for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){
			C_array[i] = C_local_block[i];
		}
		int i;
		for(i = 1; i < size; i++){
			MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns),
				A_local_block_rows * B_local_block_columns, MPI_DOUBLE, i, 0,
				cartesian_grid_communicator, &status);
		}
	} else {
		MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0,
			cartesian_grid_communicator);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, C_array,
		A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, cartesian_grid_communicator);
	// blocking, because the result is used right away afterwards... right?
//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // generating output at rank 0 if (rank == 0) { // convert the ID array into the actual C matrix int i, j, k, row, column; for (i = 0; i < sqrt_size; i++){ // block row index for (j = 0; j < sqrt_size; j++){ // block column index for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) + (row * B_local_block_columns) + column]; } } } } printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns); printf("Computation time: %lf\n", compute_time); printf("MPI time: %lf\n", mpi_time); if (argc == 4){ // present results on the screen printf("\nA( %d x %d ):\n", A_rows, A_columns); for(row = 0; row < A_rows; row++) { for(column = 0; column < A_columns; column++) printf ("%7.3f ", A[row][column]); printf ("\n"); } printf("\nB( %d x %d ):\n", B_rows, B_columns); for(row = 0; row < B_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ", B[row][column]); printf("\n"); } printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns); for(row = 0; row < A_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ",C[row][column]); printf("\n"); } printf("\nPerforming serial consistency check. Be patient...\n"); fflush(stdout); int pass = 1; double temp; for(i=0; i<A_rows; i++){ for(j=0; j<B_columns; j++){ temp = 0; for(k=0; k<B_rows; k++){ temp += A[i][k] * B[k][j]; } printf("%7.3f ", temp); if(temp != C[i][j]){ pass = 0; } } printf("\n"); } if (pass) printf("Consistency check: PASS\n"); else printf("Consistency check: FAIL\n"); } } // free all memory if(rank == 0){ int i; for(i = 0; i < A_rows; i++){ free(A[i]); } for(i = 0; i < B_rows; i++){ free(B[i]); } for(i = 0; i < A_rows; i++){ free(C[i]); } free(A); free(B); free(C); free(A_array); free(B_array); free(C_array); } free(A_local_block); free(B_local_block); free(C_local_block); // finalize MPI MPI_Finalize(); }
bool scatter(){ int i, j; int count; int count_tot; int* count_root; int* displ; MPI_Bcast(&idx, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&idy, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&idz, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&iddx, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&iddy, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&iddz, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&qch, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&dV, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&dAdrop, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&dApart, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&droplet, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&length, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&AnchNInf, 1, MPI_BYTE, root, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); //define shared window and store Qold of root processor to q for all processors to access MPI_Win_allocate_shared(6 * length * sizeof(float), 1, MPI_INFO_NULL, shmcomm, &q, &win); MPI_Scatter(Qold, 6 * length, MPI_FLOAT, q, 6 * length, MPI_FLOAT, root, MPI_COMM_WORLD); //populate share at root processor to sign at all processors sign = (int*)malloc(length * sizeof(int)); for(i = 0; i < length; i ++) sign[i] = -1; MPI_Scatter(share, length, MPI_INT, sign, length, MPI_INT, root, MPI_COMM_WORLD); //Allocate Qnew(qn) qn = (float*)malloc(6 * length * sizeof(float)); for(i = 0; i < 6 * length; i ++) qn[i] = q[i]; //populate neighbor at root processor to neigb at all processors neigb = (int*)malloc(6 * length * sizeof(int)); MPI_Scatter(neighbor, 6 * length, MPI_INT, neigb, 6 * length, MPI_INT, root, MPI_COMM_WORLD); //Adjust the index for different processors to access q in shared window for(i = 0; i < 6 * length; i ++){ neigb[i] -= length * myid; } // printf("%d:\t%d\t%d\t%d\t%d\t%d\t%d.\n", myid, neigb[0], neigb[1], neigb[2], neigb[3], neigb[4], neigb[5]); //Verify the number of droplet and boundary. If not consistent, report error. count = 0; for(i = 0; i < length; i ++){ if(sign[i] >= 0 && sign[i] < 10) count ++; } MPI_Reduce(&count, &count_tot, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD); if(myid == root && count_tot != droplet){ printf("Error in scatter. Counted number %d is not equal to droplet %d.\n", count_tot, droplet); return false; } count = 0; for(i = 0; i < length; i ++){ if(sign[i] >= 2 && sign[i] < 10) count ++; } MPI_Reduce(&count, &count_tot, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD); if(myid == root && count_tot != surf){ printf("Error in scatter(boundary). Counted number %d is not equal to surface %d.\n", count_tot, surf); return false; } count *= 3; nu_p = (double*)malloc(count * sizeof(double)); count_root = (int*)malloc(numprocs * sizeof(int)); displ = (int*)malloc(numprocs * sizeof(int)); // if(myid == root) printf("Check3.\n"); // scatter nu and qo defined at boundary nodes to different processors. 
MPI_Gather(&count, 1, MPI_INT, count_root, 1, MPI_INT, root, MPI_COMM_WORLD); if(myid == root){ for(i = 0; i < numprocs; i ++){ displ[i] = 0; for(j = 0; j < i; j++){ displ[i] += count_root[j]; } } } MPI_Scatterv(nu, count_root, displ, MPI_DOUBLE, nu_p, count, MPI_DOUBLE, root, MPI_COMM_WORLD); if((degenerate == 0 && infinite == 0) || AnchNInf){ count *= 2; if(myid == root){ for(i = 0; i < numprocs; i ++){ count_root[i] *= 2; displ[i] *= 2; } } qo_p = (float*)malloc(count * sizeof(float)); MPI_Scatterv(Qo, count_root, displ, MPI_FLOAT, qo_p, count, MPI_FLOAT, root, MPI_COMM_WORLD); } // printf("check4.\n"); if(myid == root){ free(neighbor); free(Qold); free(share); free(nu); if((degenerate == 0 && infinite == 0) || AnchNInf) free(Qo); } free(count_root); free(displ); return true; }
int main(int argc, char** argv) {
    // Record the program's start time
    clock_t t_start = clock();
    int rank;
    int numtasks;
    int i;
    int stride;
    int vector[MAX];
    for(i = 1; i <= MAX; i++)
        vector[ i - 1 ] = i;
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    stride = MAX/(numtasks);
    //printf("Stride: %d\n", stride);
    int vtmp[stride];
    // one send count and one displacement per process
    int disp[numtasks];
    int sendcount[numtasks];
    int acum;
    for(i = 0; i < numtasks; i++) {
        disp[i] = i * stride;
        sendcount[i] = stride;
    }
    // vector: where the data is taken from
    // sendcount: how many elements are sent to each process
    // disp: displacement relative to sendbuff from which process i takes its values
    // sendtype: the type of the elements being sent
    // recvbuff: where the received data is stored
    // recvcount: how many elements this process receives
    // recvtype: the type of the elements being received
    // root: the process that originates the distribution of the data
    // comm: communicator of the processes
    // MPI_Scatterv(sendbuff, sendcount, disp, sendtype, recvbuff, recvcount, recvtype, root, comm)
    MPI_Scatterv(vector, sendcount, disp, MPI_INT, vtmp, stride, MPI_INT, 0, MPI_COMM_WORLD);
    acum = 0;
    for(i = 0; i < stride; i++) {
        acum += vtmp[i];
    }
    printf("Subtotal %d on node %d\n", acum, rank);
    MPI_Reduce(&acum, vtmp, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if(rank == 0) {
        printf("TOTAL: %d\n", vtmp[0]);
        // Record the program's end time
        clock_t t_end = clock();
        // Program execution time
        clock_t t_run = t_end - t_start;
        printf ("Execution time: (%f seconds).\n",((float)t_run)/CLOCKS_PER_SEC);
    }
    MPI_Finalize();
    return 0;
}
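/* The example above assumes MAX is divisible by numtasks (every rank receives exactly stride
 * elements). A minimal sketch of a helper, hypothetical and not part of the program above, that
 * fills the sendcount/disp arrays for the uneven case; each rank would then pass
 * sendcount[rank] instead of stride as its receive count to MPI_Scatterv. */
static void fill_uneven_counts(int total, int numtasks, int *sendcount, int *disp)
{
    int base = total / numtasks;
    int rest = total % numtasks;      /* the first `rest` ranks get one extra element */
    int offset = 0;
    for (int i = 0; i < numtasks; i++) {
        sendcount[i] = base + (i < rest ? 1 : 0);
        disp[i] = offset;
        offset += sendcount[i];
    }
}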
int ORD::find_elim_ordering()
{
    int ws;
    int wr;
    char eoname[512];
    char eoname_other[512];

    // Get size and rank from the communicator
    MPI_Comm_size(comm, &ws);
    MPI_Comm_rank(comm, &wr);

    double xtime = MPI_Wtime();
    sprintf(eoname, "%s.order.%d", this->filename.c_str(), ws);
    sprintf(eoname_other, "%s.order_other.%d", this->filename.c_str(), ws);

    DEBUG("size: %d, rank %d \n", ws, wr);
    int n = G->get_num_nodes();
    int x = n/ws;
    int xm = n%ws;
    int i = 0;
    DEBUG("n: %d x: %d xm: %d \n", n, x, xm);

    vector<int> xadj;
    vector<int> adjncy;
    vector<int> vtxdist(ws + 1, 0);
    vector<int> sizes(2*ws, 0);
    vector<int> ordering(x + 1, 0);
    vector<int> recvcnt(ws, 0);
    vector<int> displ(ws, 0);

    int numflag = 0;
    int options[10];
    options[0] = 0;

    vtxdist[0] = 0;
    for (i = 1; i <= ws; i++)
    {
        vtxdist[i] = vtxdist[i - 1] + x;
        if (i <= xm)
            vtxdist[i]++;
    }

    // preparing displacements and receive counts to use with MPI_Gatherv
    for (i = 0; i < ws; i++)
    {
        recvcnt[i] = x;
        if (i < xm)
            recvcnt[i]++;
        if (i > 0)
            displ[i] += displ[i - 1] + recvcnt[i - 1];
    }

    DEBUG("range: %d, %d\n", vtxdist[wr], vtxdist[wr + 1]);
    int j = 0;
    xadj.push_back(0);
    for (i = vtxdist[wr]; i < vtxdist[wr + 1]; i++)
    {
        Graph::Node *no = G->get_node(i);
        list<int> *l = no->get_nbrs_ptr();
        list<int>::iterator it = l->begin();
        for (; it != l->end(); ++it)
        {
            adjncy.push_back(*it);
            j++;
        }
        xadj.push_back(j);
    }

    if (METIS_OK != ParMETIS_V3_NodeND(&vtxdist.front(), &xadj.front(), &adjncy.front(),
                                       &numflag, options, &ordering.front(), &sizes.front(), &comm))
    {
        FERROR("error occurred while processing parmetis, aborting\n");
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
    DEBUG("output from ParMETIS\n");
    double parmet_time = MPI_Wtime() - xtime;

    vector<int> recvbuf;
    n = G->get_num_nodes();
    if (wr == 0)
    {
        recvbuf = vector<int>(n, 0);
    }

    if (MPI_SUCCESS != MPI_Gatherv((void *)&ordering.front(), recvcnt[wr], MPI_INT,
                                   (void *)&recvbuf.front(), &recvcnt.front(), &displ.front(),
                                   MPI_INT, 0, comm))
    {
        FERROR("MPI error occurred at Gatherv, Abort!\n");
        MPI_Abort(comm, -1);
    }

    vector<int> eo(n, 0);
    if (wr == 0)
    {
        for (int i = 0; i < n; i++)
        {
            eo[recvbuf[i]] = i;
        }
        FILE *f = fopen(eoname_other, "w");
        for (int i = 0; i < n; i++)
            fprintf(f, "%d\n", eo[i] + 1);
        fclose(f);
        DEBUG("ParMetis NodeND elimination ordering is in : %s\n", eoname_other);
    }

    ordering.clear();
    ordering.resize(recvcnt[wr], 0);

    if (MPI_SUCCESS != MPI_Scatterv((void *)&eo.front(), &recvcnt.front(), &displ.front(), MPI_INT,
                                    (void *)&ordering.front(), recvcnt[wr], MPI_INT, 0, comm))
    {
        FERROR("MPI error occurred at Scatterv, Abort!\n");
\n"); MPI_Abort(comm, -1); } DEBUG("Scatterv completed\n"); Graph::GraphCreatorFile gf; Graph::WeightedMutableGraph *wg; Graph::GraphEOUtil eoutil; Graph::GraphProperties prop; list<int>members(ordering.begin(), ordering.end()); wg = gf.create_component(G, &members, false); prop.make_canonical(wg); vector<int> ord(recvcnt[wr], 0); vector<int> ordsend(recvcnt[wr, 0]); double xxtime = MPI_Wtime(); eoutil.find_elimination_ordering(wg, &ord, GD_AMD, false); DEBUG("eo time : %f\n", MPI_Wtime() - xxtime); int sz = recvcnt[wr]; for (int i = 0; i < sz; i++) ordsend[i] = wg->get_node(ord[i])->get_label(); recvbuf.assign(n, -1); if (MPI_SUCCESS != MPI_Gatherv((void *)&ordsend.front(), recvcnt[wr], MPI_INT, (void *)&recvbuf.front(), &recvcnt.front(), &displ.front(), MPI_INT, 0, comm)) { FERROR("MPI error occured at Gatherv, Abort!\n"); MPI_Abort(comm, -1); } double p_amd_time = MPI_Wtime() - xtime; if (wr == 0) { FILE *f = fopen(eoname, "w"); for (int i = 0; i < n && wr == 0; i++) fprintf(f, "%d\n", recvbuf[i]); fclose(f); } DEBUG("ordering is written into %s\n", eoname); DEBUG("%f,%f\n", parmet_time, p_amd_time); return 0; }
int main (int argc, char **argv) { FILE *fp; double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL; double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL; int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size; int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size; int rank, size, sqrt_size, matrices_a_b_dimensions[4]; MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator; MPI_Status status; // used to manage the cartesian grid int dimensions[2], periods[2], coordinates[2], remain_dims[2]; double init_time = 0.0, start; MPI_Init(&argc, &argv); start = MPI_Wtime(); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* For square mesh */ sqrt_size = (int)sqrt((double) size); if(sqrt_size * sqrt_size != size){ if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n"); MPI_Abort(MPI_COMM_WORLD, -1); } // create a 2D cartesian grid dimensions[0] = dimensions[1] = sqrt_size; periods[0] = periods[1] = 1; MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator); MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // create a row communicator remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator); // create a column communicator remain_dims[0] = 1; remain_dims[1] = 0; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator); // getting matrices from files at rank 0 only // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test] if (rank == 0){ int row, column; if ((fp = fopen (argv[1], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]); A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *)); for (row = 0; row < matrices_a_b_dimensions[0]; row++){ A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double)); for (column = 0; column < matrices_a_b_dimensions[1]; column++) fscanf(fp, "%lf", &A[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, -1); } if((fp = fopen (argv[2], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]); B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *)); for(row = 0; row < matrices_a_b_dimensions[2]; row++){ B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *)); for(column = 0; column < matrices_a_b_dimensions[3]; column++) fscanf(fp, "%lf", &B[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // need to check that the multiplication is possible given dimensions // matrices_a_b_dimensions[0] = row size of A // matrices_a_b_dimensions[1] = column size of A // matrices_a_b_dimensions[2] = row size of B // matrices_a_b_dimensions[3] = column size of B if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){ if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // this implementation is limited to cases where thematrices can be partitioned perfectly if( matrices_a_b_dimensions[0] % sqrt_size != 0 || matrices_a_b_dimensions[1] % sqrt_size != 0 || matrices_a_b_dimensions[2] % sqrt_size != 0 || 
matrices_a_b_dimensions[3] % sqrt_size != 0 ){ if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n" "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n", matrices_a_b_dimensions[0],matrices_a_b_dimensions[1], matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size ); MPI_Abort(MPI_COMM_WORLD, -1); } } // send dimensions to all peers /* @collectives: * MPI_Broadcast */ MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator); A_rows = matrices_a_b_dimensions[0]; A_columns = matrices_a_b_dimensions[1]; B_rows = matrices_a_b_dimensions[2]; B_columns = matrices_a_b_dimensions[3]; // local metadata for A A_local_block_rows = A_rows / sqrt_size; A_local_block_columns = A_columns / sqrt_size; A_local_block_size = A_local_block_rows * A_local_block_columns; A_local_block = (double *) malloc (A_local_block_size * sizeof(double)); // local metadata for B B_local_block_rows = B_rows / sqrt_size; B_local_block_columns = B_columns / sqrt_size; B_local_block_size = B_local_block_rows * B_local_block_columns; B_local_block = (double *) malloc (B_local_block_size * sizeof(double)); // local metadata for C C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double)); // C needs to be initialized at 0 (accumulates partial dot-products) int i; for(i=0; i < A_local_block_rows * B_local_block_columns; i++){ C_local_block[i] = 0; } // full arrays only needed at root if(rank == 0){ A_array = (double *) malloc(sizeof(double) * A_rows * A_columns); B_array = (double *) malloc(sizeof(double) * B_rows * B_columns); C_array = (double *) malloc(sizeof(double) * A_rows * B_columns); // generate the 1D arrays of the matrices at root int row, column, i, j; for (i = 0; i < sqrt_size; i++){ for (j = 0; j < sqrt_size; j++){ for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < A_local_block_columns; column++){ A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = A[i * A_local_block_rows + row][j * A_local_block_columns + column]; } } for (row = 0; row < B_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = B[i * B_local_block_rows + row][j * B_local_block_columns + column]; } } } } // allocate output matrix C C = (double **) malloc(A_rows * sizeof(double *)); for(i=0; i<A_rows ;i++){ C[i] = (double *) malloc(B_columns * sizeof(double)); } } // send a block to each process /* @collectives: /* MPI_Scatter with sendcount=A/B_local_block_size. The if-else clause and the for-loops can be replaced. 
*/ { //compute displacements int row_displs[size]; // displacements for A int col_displs[size]; // displacements for B int row, col; for(row = 0; row < sqrt_size; ++row) { for(col = 0; col < sqrt_size; ++col) { int i = row*sqrt_size + col; if(row != 0) { int col_loc = (col + sqrt_size - row) % sqrt_size; row_displs[i] = row*sqrt_size + col_loc; } else { row_displs[i] = i; } row_displs[i] *= A_local_block_size; if(col != 0) { int row_loc = (row + sqrt_size - col) % sqrt_size; col_displs[i] = row_loc*sqrt_size + col; } else { col_displs[i] = i; } col_displs[i] *= B_local_block_size; } } // set counts for scattering A; int counts[size]; int i; for(i = 0; i < size; ++i) { counts[i] = A_local_block_size; } MPI_Scatterv(A_array, counts, row_displs, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator); for(i = 0; i < size; ++i) { counts[i] = B_local_block_size; } MPI_Scatterv(B_array, counts, col_displs, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator); } init_time += MPI_Wtime() - start; // cannon's algorithm int cannon_block_cycle; double compute_time = 0, mpi_time = 0; int C_index, A_row, A_column, B_column; for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){ // compute partial result for this block cycle start = MPI_Wtime(); for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){ for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){ for(A_column = 0; A_column < A_local_block_columns; A_column++){ C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] * B_local_block[A_column * B_local_block_columns + B_column]; } } } compute_time += MPI_Wtime() - start; start = MPI_Wtime(); // rotate blocks horizontally MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + sqrt_size - 1) % sqrt_size, 0, (coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status); // rotate blocks vertically MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - 1) % sqrt_size, 0, (coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status); mpi_time += MPI_Wtime() - start; } // get C parts from other processes at rank 0 /* @collectives: * MPI_Gather with sendcount=A_local_block_rows * B_local_block_columns */ double output_time = 0.0; start = MPI_Wtime(); MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, cartesian_grid_communicator); output_time += MPI_Wtime() - start; // generating output at rank 0 if (rank == 0) { // convert the ID array into the actual C matrix int i, j, k, row, column; for (i = 0; i < sqrt_size; i++){ // block row index for (j = 0; j < sqrt_size; j++){ // block column index for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) + (row * B_local_block_columns) + column]; } } } } printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns); printf("Computation time: %lf\n", compute_time); printf("MPI time: %lf\n", mpi_time); printf("Setup time: %lf\n", init_time); printf("Output time: %lf\n", output_time); if (argc == 4){ // present results on the screen printf("\nA( %d x %d ):\n", A_rows, A_columns); for(row = 0; row < A_rows; 
row++) { for(column = 0; column < A_columns; column++) printf ("%7.3f ", A[row][column]); printf ("\n"); } printf("\nB( %d x %d ):\n", B_rows, B_columns); for(row = 0; row < B_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ", B[row][column]); printf("\n"); } printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns); for(row = 0; row < A_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ",C[row][column]); printf("\n"); } printf("\nPerforming serial consistency check. Be patient...\n"); fflush(stdout); int pass = 1; double temp; for(i=0; i<A_rows; i++){ for(j=0; j<B_columns; j++){ temp = 0; for(k=0; k<B_rows; k++){ temp += A[i][k] * B[k][j]; } printf("%7.3f ", temp); if(temp != C[i][j]){ pass = 0; } } printf("\n"); } if (pass) printf("Consistency check: PASS\n"); else printf("Consistency check: FAIL\n"); } } // free all memory if(rank == 0){ int i; for(i = 0; i < A_rows; i++){ free(A[i]); } for(i = 0; i < B_rows; i++){ free(B[i]); } for(i = 0; i < A_rows; i++){ free(C[i]); } free(A); free(B); free(C); free(A_array); free(B_array); free(C_array); } free(A_local_block); free(B_local_block); free(C_local_block); // finalize MPI MPI_Finalize(); }
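/* Alternative sketch (not the code path used above): the skew baked into
 * row_displs/col_displs pre-aligns the A and B blocks during the scatter. If the
 * blocks were instead scattered in natural order (displacement i * block_size),
 * Cannon's initial alignment could be performed afterwards with MPI_Cart_shift and
 * MPI_Sendrecv_replace. Assumes the same communicators, coordinates[], block buffers
 * and status variable as above. */
{
    int shift_src, shift_dst;
    /* shift row i of A left by i block positions (periodic grid, so shifts wrap) */
    MPI_Cart_shift(row_communicator, 0, -coordinates[0], &shift_src, &shift_dst);
    MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
                         shift_dst, 0, shift_src, 0, row_communicator, &status);
    /* shift column j of B up by j block positions */
    MPI_Cart_shift(column_communicator, 0, -coordinates[1], &shift_src, &shift_dst);
    MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
                         shift_dst, 0, shift_src, 0, column_communicator, &status);
}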
int main(int argc, char *argv[]) {
    double startTime, endTime;
    int numElements, myrank, numnodes, N, i, j, k, x;
    int resto;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &numnodes);

    N = atoi(argv[1]);
    resto = N % numnodes;

    // The per-rank counts and displacements can only be sized once numnodes is known
    int tamanio[numnodes];
    int desplazamiento[numnodes];

    double A[N][N], B[N][N], C[N][N];
    double auxA[N][N], auxC[N][N];

    if (myrank == 0) {
        // initialize A and B
        x = 0;
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                A[i][j] = x;
                B[i][j] = x;
                x++;
            }
        }
    }

    // Start the timer
    if (myrank == 0) {
        startTime = MPI_Wtime();
    }

    numElements = N / numnodes; // number of rows each worker operates on
    desplazamiento[0] = 0;
    for (j = 0; j < numnodes - resto; j++) {
        tamanio[j] = numElements * N;
        if (j + 1 < numnodes) desplazamiento[j + 1] = desplazamiento[j] + (numElements * N);
    }
    for (j = numnodes - resto; j < numnodes; j++) {
        tamanio[j] = N * (numElements + 1);
        if (j != numnodes - 1) {
            desplazamiento[j + 1] = desplazamiento[j] + (N * (numElements + 1));
        }
    }
    for (i = 0; i < numnodes; i++) {
        printf("desplazamiento[%d] = %d \n", i, desplazamiento[i]);
        printf("tamanio[%d] = %d \n", i, tamanio[i]);
    }

    // The master scatters the rows of A among the workers
    MPI_Scatterv(&A, tamanio, desplazamiento, MPI_DOUBLE, &auxA, tamanio[myrank], MPI_DOUBLE, 0, MPI_COMM_WORLD);
    // Everyone gets B
    MPI_Bcast(&B, N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Each process initializes C and its partial result auxC to 0
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            C[i][j] = 0.0;
            auxC[i][j] = 0.0;
        }
    }

    // Multiply only the local rows of A by B
    int filasLocales = tamanio[myrank] / N; // local row count
    for (i = 0; i < filasLocales; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                auxC[i][j] += auxA[i][k] * B[k][j];
            }
        }
    }

    // The master gathers the partial results into C
    MPI_Gatherv(&auxC, tamanio[myrank], MPI_DOUBLE, &C, tamanio, desplazamiento, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Stop the timer
    if (myrank == 0) {
        endTime = MPI_Wtime();
    }

    // Print matrix A
    if (myrank == 0 && N < 10) {
        printf("Matrix A:\n");
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                printf("%f ", A[i][j]);
            }
            printf("\n");
        }
    }
    // Print matrix B
    if (myrank == 0 && N < 10) {
        printf("\nMatrix B:\n");
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                printf("%f ", B[i][j]);
            }
            printf("\n");
        }
    }
    // Print matrix C
    if (myrank == 0 && N < 10) {
        printf("\nMatrix C:\n");
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                printf("%f ", C[i][j]);
            }
            printf("\n");
        }
    }
    if (myrank == 0) {
        printf("\nElapsed time: %f seconds.\n\n", endTime - startTime);
    }
    MPI_Finalize();
    return 0;
}
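/* Optional sanity check (sketch, not part of the original program): before the
 * Scatterv/Gatherv calls above, the counts and displacements should tile the N*N
 * element array exactly once. Same tamanio/desplazamiento/numnodes/N/myrank as above. */
{
    int total = 0, ok = 1;
    for (i = 0; i < numnodes; i++) {
        if (desplazamiento[i] != total) ok = 0; // each block must start where the previous one ended
        total += tamanio[i];
    }
    if (!ok || total != N * N) {
        if (myrank == 0) printf("inconsistent counts/displacements: covered %d of %d elements\n", total, N * N);
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
}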
int main(int argc, char **argv)
{
    int numprocs, rank, namelen, i;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int *vx = NULL, *vy = NULL, *vz = NULL, *vxpart = NULL, *vypart = NULL, *vzpart = NULL, coeff[2];
    int exp = 0, act = 0;
    int *count = NULL;
    int *disp = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* let the main process initialize the data */
    if (rank == 0) {
        vx = (int *) malloc(sizeof(int) * DIM_GLOBAL);
        vy = (int *) malloc(sizeof(int) * DIM_GLOBAL);
        vz = (int *) malloc(sizeof(int) * DIM_GLOBAL);
        for (i = 0; i < DIM_GLOBAL; i++) {
            vx[i] = i;
            vy[i] = i;
            vz[i] = 0;
            exp += 2 * i + 3 * i;
        }
        coeff[0] = 2;
        coeff[1] = 3;
    }

    /* compute size of chunks */
    sendcounts_array(&count, numprocs, DIM_GLOBAL);
    displs_array(&disp, count, numprocs);

    /* allocate work buffers on all ranks, including the master process */
    vxpart = (int *) malloc(sizeof(int) * count[rank]);
    vypart = (int *) malloc(sizeof(int) * count[rank]);
    vzpart = (int *) malloc(sizeof(int) * count[rank]);

    /* Scatter the data to peers; MPI_INT is the correct MPI datatype for C int
     * (MPI_INTEGER is the Fortran INTEGER type) */
    MPI_Scatterv(vx, count, disp, MPI_INT, vxpart, count[rank], MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatterv(vy, count, disp, MPI_INT, vypart, count[rank], MPI_INT, 0, MPI_COMM_WORLD);

    /* Broadcast coeff because it is the same for all computations */
    MPI_Bcast(coeff, 2, MPI_INT, 0, MPI_COMM_WORLD);

    /* perform the actual computation */
    for (i = 0; i < count[rank]; i++) {
        vzpart[i] = coeff[0] * vxpart[i] + coeff[1] * vypart[i];
    }

    /* Gather the results */
    MPI_Gatherv(vzpart, count[rank], MPI_INT, vz, count, disp, MPI_INT, 0, MPI_COMM_WORLD);

    /* verify result */
    if (rank == 0) {
        for (i = 0; i < DIM_GLOBAL; i++) {
            act += vz[i];
        }
        printf("exp=%d act=%d\n", exp, act);
    }

    if (rank == 0) {
        FREE(vx);
        FREE(vy);
        FREE(vz);
    }
    FREE(vxpart);
    FREE(vypart);
    FREE(vzpart);
    FREE(disp);
    FREE(count);

    MPI_Get_processor_name(processor_name, &namelen);
    MPI_Finalize();
    return 0;
}
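/* sendcounts_array() and displs_array() are used above but not shown in this listing.
 * The following are hypothetical implementations that merely match the call sites:
 * they split 'dim' elements over 'nprocs' ranks, giving the first dim % nprocs ranks
 * one extra element. The real helpers may partition differently. */
static void sendcounts_array(int **count, int nprocs, int dim)
{
    int i, base = dim / nprocs, rem = dim % nprocs;
    *count = (int *) malloc(sizeof(int) * nprocs);
    for (i = 0; i < nprocs; i++)
        (*count)[i] = base + (i < rem ? 1 : 0);
}

static void displs_array(int **disp, const int *count, int nprocs)
{
    int i;
    *disp = (int *) malloc(sizeof(int) * nprocs);
    (*disp)[0] = 0;
    for (i = 1; i < nprocs; i++)
        (*disp)[i] = (*disp)[i - 1] + count[i - 1];
}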
void GenVector_ReadCSV(denseType * vector, long length, long num_cols, char* rhsFile, int myid, int numprocs) { long idx; long local_length, local_length_normal; int ierr; double * Total_data_buffer; int sendCount[numprocs]; int sendDispls[numprocs]; long procCounter; double normel_ele_num; ierr = MPI_Bcast((void*) &length, 1, MPI_LONG, 0, MPI_COMM_WORLD); #ifdef GETRHS_DEBUG printf("in GetRHS.c, myid=%d, length=%d\n", myid, length); #endif local_length_normal = length / numprocs; if (myid == numprocs - 1) local_length = length - (numprocs - 1) * local_length_normal; else local_length = local_length_normal; normel_ele_num = local_length_normal * num_cols; for (procCounter = 0; procCounter < numprocs; procCounter++) { sendCount[procCounter] = (int)normel_ele_num; sendDispls[procCounter] = (int)procCounter * normel_ele_num; } sendCount[numprocs - 1] = (int)((length - (numprocs - 1) * local_length_normal) * num_cols); #ifdef GETRHS_DEBUG printf("in GetRHS.c, myid = %d, local_length=%d\n", myid, local_length); #endif vector->local_num_row = local_length; vector->local_num_col = num_cols; // only consider vector->global_num_row = length; vector->global_num_col = num_cols; #ifdef GETRHS_DEBUG #endif vector->data = (double *) calloc(vector->local_num_row * vector->local_num_col, sizeof (double)); long local_num_element = vector->local_num_row * vector->local_num_col; // rank 0 read CSV if (myid == 0) { printf("Reading MRHS data from %s ... ...\n", rhsFile); parseCSV(rhsFile, &Total_data_buffer, length, num_cols); printf("Reading MRHS data from %s done.\n", rhsFile); #ifdef GenVector_ReadCSV_DB //check_csv_array_print(Total_data_buffer, length, num_cols, myid); // exit(0); #endif } // // Scatter data // int MPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs, // MPI_Datatype sendtype, void *recvbuf, int recvcount, // MPI_Datatype recvtype, // int root, MPI_Comm comm) ierr = MPI_Scatterv((void*) Total_data_buffer, (int*)sendCount, (int*)sendDispls, MPI_DOUBLE, vector->data, (int)local_num_element, MPI_DOUBLE, 0, MPI_COMM_WORLD); // // // based on the assumption of equal division of rows among processes vector->start_idx = myid * vector->global_num_col * local_length_normal; #ifdef GenVector_ReadCSV_DB // if (myid == 0){ // check_csv_array_print(vector->data, vector->local_num_row, vector->global_num_col, myid); // printf ("local rows:%d, local cols %d, local nnz:%d\n", vector->local_num_row, vector->local_num_col,local_num_element); // } if (myid == numprocs-1) { local_dense_mat_print(*vector, myid); } exit(0); #endif if (myid == 0) { free(Total_data_buffer); } }
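/* parseCSV() is called above but defined elsewhere; the sketch below is a hypothetical
 * stand-in that matches the call site, assuming a plain text file of rows*cols numbers
 * separated by commas and/or whitespace. The project's actual parser may differ. */
static void parseCSV(const char *path, double **buffer, long rows, long cols)
{
    FILE *f = fopen(path, "r");
    long i, n = rows * cols;
    if (f == NULL) { perror(path); exit(EXIT_FAILURE); }
    *buffer = (double *) malloc(n * sizeof(double));
    for (i = 0; i < n; i++) {
        if (fscanf(f, " %lf", &(*buffer)[i]) != 1) {
            fprintf(stderr, "parseCSV: short read at element %ld\n", i);
            break;
        }
        fscanf(f, " ,"); /* consume an optional comma separator */
    }
    fclose(f);
}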
int main(int argc, char *argv[])
{
    int m, n, c, iters, i, j;
    int my_m, my_n, my_rank, num_procs, size;
    float kappa;
    image u, u_bar;
    unsigned char *image_chars, *my_image_chars;
    char *input_jpeg_filename, *output_jpeg_filename;
    int *sendcounts, *displs, *recvcounts;

    printf("Now in main program\n");
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    /* the count/displacement arrays can only be allocated once num_procs is known */
    sendcounts = (int*)malloc(num_procs*sizeof(int));
    displs     = (int*)malloc(num_procs*sizeof(int));
    recvcounts = (int*)malloc(num_procs*sizeof(int));

    /* read from command line: kappa, iters, input_jpeg_filename, output_jpeg_filename */
    kappa = atof(argv[1]);
    iters = atoi(argv[2]);
    input_jpeg_filename = argv[3];
    output_jpeg_filename = argv[4];
    /* Test that parameters are read correctly from command line:
    printf("kappa: %f\n", kappa);
    printf("iters: %d\n", iters);
    printf("input_jpeg_filename: %s\n", input_jpeg_filename);
    printf("output_jpeg_filename: %s\n", output_jpeg_filename);
    */

    if (my_rank == 0) {
        import_JPEG_file(input_jpeg_filename, &image_chars, &m, &n, &c);
        printf("Successfully imported JPEG image.\n");
    }
    MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* Divide the m x n pixels evenly among the MPI processes; if they cannot be evenly */
    /* divided, the last process picks up the remainder. Each process also needs halo   */
    /* rows above and below its strip: the first and last process need only 1, interior */
    /* processes need 2, and a single process needs none. */
    my_m = m/num_procs;
    my_n = n;
    if (my_rank == num_procs - 1) my_m += m % num_procs;
    int halo = (num_procs == 1) ? 0 : ((my_rank == 0 || my_rank == num_procs - 1) ? 1 : 2);
    allocate_image(&u, my_m + halo, my_n);
    allocate_image(&u_bar, my_m + halo, my_n);
    size = (my_m + halo)*my_n;

    /* Each process receives its partitioned region of image_chars (including halo rows) */
    /* from process 0. The root needs counts and displacements for every rank.           */
    if (my_rank == 0) {
        int r, base_m = m/num_procs, rows, h;
        for (r = 0; r < num_procs; r++) {
            rows = base_m + ((r == num_procs - 1) ? m % num_procs : 0);
            h = (num_procs == 1) ? 0 : ((r == 0 || r == num_procs - 1) ? 1 : 2);
            sendcounts[r] = (rows + h)*n;
            displs[r] = (r == 0) ? 0 : n*(r*base_m - 1); /* start one row above the own strip */
        }
    }

    /* receive the raw bytes into a local buffer, then copy the values into u as floats */
    my_image_chars = (unsigned char*)malloc(size*sizeof(unsigned char));
    MPI_Scatterv(image_chars, sendcounts, displs, MPI_UNSIGNED_CHAR,
                 my_image_chars, size, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);

    /* Convert data type from unsigned char to float: */
    for (i = 0; i < size/my_n; i++) {
        for (j = 0; j < my_n; j++) {
            u.image_data[i][j] = (float)my_image_chars[i*my_n + j];
        }
    }

    iso_diffusion_denoising(&u, &u_bar, kappa, iters);

    /* Convert the interior rows of u back to unsigned char */
    /* (rank 0 owns the first row of its buffer, the other ranks skip the top halo row). */
    int first_row = (my_rank == 0) ? 0 : 1;
    for (i = 0; i < my_m; i++) {
        for (j = 0; j < my_n; j++) {
            my_image_chars[i*my_n + j] = (unsigned char)u.image_data[first_row + i][j];
        }
    }

    /* Each process sends its resulting interior rows back to process 0, which */
    /* copies them into the designated region of image_chars. */
    if (my_rank == 0) {
        int r, base_m = m/num_procs;
        for (r = 0; r < num_procs; r++) {
            recvcounts[r] = (base_m + ((r == num_procs - 1) ? m % num_procs : 0))*n;
            displs[r] = r*base_m*n;
        }
    }
    MPI_Gatherv(my_image_chars, my_m*my_n, MPI_UNSIGNED_CHAR,
                image_chars, recvcounts, displs, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);

    if (my_rank == 0) {
        c = 1; /* single-channel output, as in the original code */
        export_JPEG_file(output_jpeg_filename, image_chars, m, n, c, 75);
        printf("Successfully exported JPEG image!\n");
    }

    free(my_image_chars);
    free(sendcounts);
    free(displs);
    free(recvcounts);
    deallocate_image(&u);
    deallocate_image(&u_bar);
    MPI_Finalize();
    printf("Finished the program!\n");
    return 0;
}
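/* Sketch (assumptions: num_procs > 1, u.image_data[r] is a contiguous row of my_n floats,
 * and my_rank/num_procs/my_m/my_n/first_row as above): iso_diffusion_denoising is not
 * shown here, but between iterations each process would have to refresh its halo rows,
 * for example with a pair of MPI_Sendrecv calls like these. */
{
    int up   = (my_rank == 0) ? MPI_PROC_NULL : my_rank - 1;
    int down = (my_rank == num_procs - 1) ? MPI_PROC_NULL : my_rank + 1;
    float *top_halo    = u.image_data[0];
    float *bottom_halo = (down == MPI_PROC_NULL) ? NULL : u.image_data[first_row + my_m];
    /* send the first interior row up, receive the bottom halo row from below */
    MPI_Sendrecv(u.image_data[first_row], my_n, MPI_FLOAT, up, 0,
                 bottom_halo, my_n, MPI_FLOAT, down, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    /* send the last interior row down, receive the top halo row from above */
    MPI_Sendrecv(u.image_data[first_row + my_m - 1], my_n, MPI_FLOAT, down, 1,
                 top_halo, my_n, MPI_FLOAT, up, 1,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}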