template <class T, int BASE>
inline void MyMPI_Send (FlatArray<T, BASE> s, int dest)
{
  MPI_Send( &s.First(), s.Size(), MyGetMPIType<T>(), dest, 1, MPI_COMM_WORLD);
}
/*@C PetscMallocDumpLog - Dumps the log of all calls to PetscMalloc(); also calls PetscMemoryGetMaximumUsage() Collective on PETSC_COMM_WORLD Input Parameter: . fp - file pointer; or NULL Options Database Key: . -malloc_log - Activates PetscMallocDumpLog() Level: advanced Fortran Note: The calling sequence in Fortran is PetscMallocDumpLog(integer ierr) The fp defaults to stdout. .seealso: PetscMallocGetCurrentUsage(), PetscMallocDump(), PetscMallocSetDumpLog() @*/ PetscErrorCode PetscMallocDumpLog(FILE *fp) { PetscInt i,j,n,dummy,*perm; size_t *shortlength; int *shortcount,err; PetscMPIInt rank,size,tag = 1212 /* very bad programming */; PetscBool match; const char **shortfunction; PetscLogDouble rss; MPI_Status status; PetscErrorCode ierr; PetscFunctionBegin; ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRQ(ierr); ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRQ(ierr); /* Try to get the data printed in order by processor. This will only sometimes work */ err = fflush(fp); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file"); ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr); if (rank) { ierr = MPI_Recv(&dummy,1,MPIU_INT,rank-1,tag,MPI_COMM_WORLD,&status);CHKERRQ(ierr); } if (PetscLogMalloc < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"PetscMallocDumpLog() called without call to PetscMallocSetDumpLog() this is often due to\n setting the option -malloc_log AFTER PetscInitialize() with PetscOptionsInsert() or PetscOptionsInsertFile()"); if (!fp) fp = PETSC_STDOUT; ierr = PetscMemoryGetMaximumUsage(&rss);CHKERRQ(ierr); if (rss) { ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Maximum memory PetscMalloc()ed %.0f maximum size of entire process %.0f\n",rank,(PetscLogDouble)TRMaxMem,rss);CHKERRQ(ierr); } else { ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Maximum memory PetscMalloc()ed %.0f OS cannot compute size of entire process\n",rank,(PetscLogDouble)TRMaxMem);CHKERRQ(ierr); } shortcount = (int*)malloc(PetscLogMalloc*sizeof(int));if (!shortcount) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory"); shortlength = (size_t*)malloc(PetscLogMalloc*sizeof(size_t));if (!shortlength) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory"); shortfunction = (const char**)malloc(PetscLogMalloc*sizeof(char*));if (!shortfunction) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory"); for (i=0,n=0; i<PetscLogMalloc; i++) { for (j=0; j<n; j++) { ierr = PetscStrcmp(shortfunction[j],PetscLogMallocFunction[i],&match);CHKERRQ(ierr); if (match) { shortlength[j] += PetscLogMallocLength[i]; shortcount[j]++; goto foundit; } } shortfunction[n] = PetscLogMallocFunction[i]; shortlength[n] = PetscLogMallocLength[i]; shortcount[n] = 1; n++; foundit:; } perm = (PetscInt*)malloc(n*sizeof(PetscInt));if (!perm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory"); for (i=0; i<n; i++) perm[i] = i; ierr = PetscSortStrWithPermutation(n,(const char**)shortfunction,perm);CHKERRQ(ierr); ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Memory usage sorted by function\n",rank);CHKERRQ(ierr); for (i=0; i<n; i++) { ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] %d %.0f %s()\n",rank,shortcount[perm[i]],(PetscLogDouble)shortlength[perm[i]],shortfunction[perm[i]]);CHKERRQ(ierr); } free(perm); free(shortlength); free(shortcount); free((char**)shortfunction); err = fflush(fp); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file"); if (rank != size-1) { ierr = MPI_Send(&dummy,1,MPIU_INT,rank+1,tag,MPI_COMM_WORLD);CHKERRQ(ierr); } PetscFunctionReturn(0); }
static int fB(realtype t, N_Vector u, N_Vector uB, N_Vector uBdot, void *user_dataB) { realtype *uBdata, *duBdata, *udata; realtype uBLeft, uBRight, uBi, uBlt, uBrt; realtype uLeft, uRight, ui, ult, urt; realtype dx, hordc, horac, hdiff, hadv; realtype *z1, *z2, intgr1, intgr2; long int i, my_length; int npes, my_pe, my_pe_m1, my_pe_p1, last_pe, my_last; UserData data; realtype data_in[2], data_out[2]; MPI_Status status; MPI_Comm comm; /* Extract MPI info. from data */ data = (UserData) user_dataB; comm = data->comm; npes = data->npes; my_pe = data->my_pe; if (my_pe == npes) { /* This process performs the quadratures */ /* Obtain local arrays */ duBdata = NV_DATA_P(uBdot); my_length = NV_LOCLENGTH_P(uB); /* Loop over all other processes and load right hand side of quadrature eqs. */ duBdata[0] = ZERO; duBdata[1] = ZERO; for (i=0; i<npes; i++) { MPI_Recv(&intgr1, 1, PVEC_REAL_MPI_TYPE, i, 0, comm, &status); duBdata[0] += intgr1; MPI_Recv(&intgr2, 1, PVEC_REAL_MPI_TYPE, i, 0, comm, &status); duBdata[1] += intgr2; } } else { /* This process integrates part of the PDE */ /* Extract problem constants and work arrays from data */ dx = data->dx; hordc = data->hdcoef; horac = data->hacoef; z1 = data->z1; z2 = data->z2; /* Obtain local arrays */ uBdata = NV_DATA_P(uB); duBdata = NV_DATA_P(uBdot); udata = NV_DATA_P(u); my_length = NV_LOCLENGTH_P(uB); /* Compute related parameters. */ my_pe_m1 = my_pe - 1; my_pe_p1 = my_pe + 1; last_pe = npes - 1; my_last = my_length - 1; /* Pass needed data to processes before and after current process. */ if (my_pe != 0) { data_out[0] = udata[0]; data_out[1] = uBdata[0]; MPI_Send(data_out, 2, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm); } if (my_pe != last_pe) { data_out[0] = udata[my_length-1]; data_out[1] = uBdata[my_length-1]; MPI_Send(data_out, 2, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm); } /* Receive needed data from processes before and after current process. */ if (my_pe != 0) { MPI_Recv(data_in, 2, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm, &status); uLeft = data_in[0]; uBLeft = data_in[1]; } else { uLeft = ZERO; uBLeft = ZERO; } if (my_pe != last_pe) { MPI_Recv(data_in, 2, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm, &status); uRight = data_in[0]; uBRight = data_in[1]; } else { uRight = ZERO; uBRight = ZERO; } /* Loop over all grid points in current process. */ for (i=0; i<my_length; i++) { /* Extract uB at x_i and two neighboring points */ uBi = uBdata[i]; uBlt = (i==0) ? uBLeft: uBdata[i-1]; uBrt = (i==my_length-1) ? uBRight : uBdata[i+1]; /* Set diffusion and advection terms and load into udot */ hdiff = hordc*(uBlt - TWO*uBi + uBrt); hadv = horac*(uBrt - uBlt); duBdata[i] = - hdiff + hadv; /* Extract u at x_i and two neighboring points */ ui = udata[i]; ult = (i==0) ? uLeft: udata[i-1]; urt = (i==my_length-1) ? uRight : udata[i+1]; /* Load integrands of the two space integrals */ z1[i] = uBdata[i]*(ult - TWO*ui + urt)/(dx*dx); z2[i] = uBdata[i]*(urt - ult)/(TWO*dx); } /* Compute local integrals */ intgr1 = Xintgr(z1, my_length, dx); intgr2 = Xintgr(z2, my_length, dx); /* Send local integrals to 'quadrature' process */ MPI_Send(&intgr1, 1, PVEC_REAL_MPI_TYPE, npes, 0, comm); MPI_Send(&intgr2, 1, PVEC_REAL_MPI_TYPE, npes, 0, comm); } return(0); }
void send_values(double *v, int n, int child_id)
{
  /* Send the element count first so the child can size its buffer, then the data itself. */
  MPI_Send(&n, 1, MPI_INT, child_id, tag, MPI_COMM_WORLD);
  MPI_Send(v, n, MPI_DOUBLE, child_id, tag, MPI_COMM_WORLD);
}
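The matching receiver is not part of this snippet. Below is a minimal sketch of one; the name recv_values, the parent_id parameter, and the tag value are assumptions for illustration, not part of the original code.

/* Sketch of a receiver matching send_values(); names and the tag value are assumed. */
#include <stdlib.h>
#include <mpi.h>

static const int tag = 0;   /* stand-in for the global 'tag' used by send_values() */

double *recv_values(int *n, int parent_id)
{
    MPI_Status status;
    /* First the element count, then a buffer of that many doubles. */
    MPI_Recv(n, 1, MPI_INT, parent_id, tag, MPI_COMM_WORLD, &status);
    double *v = malloc(*n * sizeof(double));
    MPI_Recv(v, *n, MPI_DOUBLE, parent_id, tag, MPI_COMM_WORLD, &status);
    return v;   /* caller owns the buffer and must free() it */
}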
// =========================================================================== // matrix_mult_mpi() MPI version of dense matrix multiplication. // Processor cores are laid out in a 2D space // core_rows * core_cols layout. Each core // has a portion of matrices A and B (each such // portion blk_size * blk_size numbers) and // computes a portion of matrix C = A * B. // =========================================================================== // * INPUTS // int num_procs Number of processors to use from the MPI setup // int tile_size Number of core rows and columns in the 2D layout // int blk_size Number of rows and columns for each portion. // The total size of A, B and C is // tile_size ^ 2 * blk_size ^ 2 elements. // int verify 0: don't verify the multiplication // 1: core 0 gathers all matrices back and // does the verification // // * RETURN VALUE // int 0 for success // =========================================================================== int matrix_mult_mpi(int num_procs, int tile_size, int blk_size, int verify) { int num_cores; int rank; int tile_row; int tile_col; int blk_size_sq = 0; int whole_size = 0; int whole_size_sq = 0; int phase; int tag_a; int tag_b; int tag_c; MPI_Request reqs[2]; int num_reqs; MPI_Status status; float *a = NULL; float *b = NULL; float *c = NULL; int peer_rank; float *peer_a = NULL; float *peer_b = NULL; float *work_a = NULL; float *work_b = NULL; float *whole_a = NULL; float *whole_b = NULL; float *whole_c = NULL; float *peer_c = NULL; float verify_val; #ifdef ARCH_MB unsigned int time_start = 0; unsigned int time_stop; unsigned int time; #endif int i; int j; int k; // Initialize MPI //MPI_Init(NULL, NULL); // Who are we? MPI_Comm_size(MPI_COMM_WORLD, &num_cores); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Sanity checks if (num_cores < num_procs) { if (!rank) { kt_printf("Cannot run with %d cores, MPI setup has only %d cores\r\n", num_procs, num_cores); } return 1; } if (tile_size * tile_size != num_procs) { if (!rank) { kt_printf("%d * %d tiles != %d cores\r\n", tile_size, tile_size, num_procs); } return 1; } rank_to_tile(tile_size, rank, &tile_row, &tile_col); // Are we running? if (rank < num_procs) { // Create array portions blk_size_sq = blk_size * blk_size; whole_size = tile_size * blk_size; whole_size_sq = whole_size * whole_size; a = kt_malloc(blk_size_sq * sizeof(float)); b = kt_malloc(blk_size_sq * sizeof(float)); c = kt_malloc(blk_size_sq * sizeof(float)); // Initialize our portions of A and B with some values, zero out C for (i = 0; i < blk_size_sq; i++) { a[i] = rank + i; b[i] = rank - i; c[i] = 0.0; } // Create two buffers to receive peer A and B portions peer_a = kt_malloc(blk_size_sq * sizeof(float)); peer_b = kt_malloc(blk_size_sq * sizeof(float)); // Assign tags for A and B send/recvs tag_a = 42; tag_b = 666; tag_c = 3; } // Synchronize everyone if (!rank) { kt_printf("Matrix multiplication of %d x %d starting on %d core(s)\r\n", whole_size, whole_size, num_procs); } MPI_Barrier(MPI_COMM_WORLD); // Keep time if (!rank) { #ifdef ARCH_MB time_start = ar_glob_timer_read(); #endif } // If not part of active cores, skip to next barrier if (rank >= num_procs) { goto skip; } // For all the phases in the algorithm for (phase = 0; phase < tile_size; phase++) { // Remember how many waits we'll have to do num_reqs = 0; // Are we responsible to broadcast our A portion? 
if (tile_col == phase) { // Broadcast to others in our tile row for (i = 0; i < tile_size; i++) { if (i != tile_col) { peer_rank = tile_to_rank(tile_size, tile_row, i); MPI_Isend(a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, MPI_COMM_WORLD, NULL); } } work_a = a; } else { // Receive A portion from someone else from our tile row peer_rank = tile_to_rank(tile_size, tile_row, phase); MPI_Irecv(peer_a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, MPI_COMM_WORLD, &reqs[num_reqs++]); work_a = peer_a; } // Are we responsible to broadcast our B portion? if (tile_row == phase) { // Broadcast to others in our tile col for (i = 0; i < tile_size; i++) { if (i != tile_row) { peer_rank = tile_to_rank(tile_size, i, tile_col); MPI_Isend(b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, MPI_COMM_WORLD, NULL); } } work_b = b; } else { // Receive B portion from someone else from our tile col peer_rank = tile_to_rank(tile_size, phase, tile_col); MPI_Irecv(peer_b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, MPI_COMM_WORLD, &reqs[num_reqs++]); work_b = peer_b; } // Wait for needed receives to arrive if (num_reqs) { MPI_Waitall(num_reqs, reqs, &status); } // Add to partial results of C the A * B portion sums for (i = 0; i < blk_size; i++) { for (j = 0; j < blk_size; j++) { for (k = 0; k < blk_size; k++) { c[i * blk_size + j] += work_a[i * blk_size + k] * work_b[k * blk_size + j]; } } } } // Synchronize skip: MPI_Barrier(MPI_COMM_WORLD); // Keep time if (!rank) { #ifdef ARCH_MB time_stop = ar_glob_timer_read(); if (time_stop > time_start) { time = time_stop - time_start; } else { time = 0xFFFFFFFF - (time_start - time_stop); } kt_printf("Time: %10u cycles (%6u msec)\r\n", time, time / 10000); #endif } if (!verify) { goto finished; } // Rank 0 gathers all A, B and C's and verifies if (rank == 0) { kt_printf("Multiplication finished, rank 0 is gathering results...\r\n"); // Allocate big arrays whole_a = kt_malloc(whole_size_sq * sizeof(float)); whole_b = kt_malloc(whole_size_sq * sizeof(float)); whole_c = kt_malloc(whole_size_sq * sizeof(float)); // Allocate partial C buffer peer_c = kt_malloc(blk_size_sq * sizeof(float)); // Place my partial arrays matrix_mult_mpi_place_partial(whole_a, (float *) a, tile_size, blk_size, 0); matrix_mult_mpi_place_partial(whole_b, (float *) b, tile_size, blk_size, 0); matrix_mult_mpi_place_partial(whole_c, (float *) c, tile_size, blk_size, 0); // Gather from others for (peer_rank = 1; peer_rank < num_procs; peer_rank++) { MPI_Recv(peer_a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, MPI_COMM_WORLD, &status); MPI_Recv(peer_b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, MPI_COMM_WORLD, &status); MPI_Recv(peer_c, blk_size_sq, MPI_FLOAT, peer_rank, tag_c, MPI_COMM_WORLD, &status); matrix_mult_mpi_place_partial(whole_a, (float *) peer_a, tile_size, blk_size, peer_rank); matrix_mult_mpi_place_partial(whole_b, (float *) peer_b, tile_size, blk_size, peer_rank); matrix_mult_mpi_place_partial(whole_c, (float *) peer_c, tile_size, blk_size, peer_rank); } // Print //kt_printf("A is:\r\n"); //matrix_mult_mpi_print_matrix(whole_a, whole_size); //kt_printf("\r\nB is:\r\n"); //matrix_mult_mpi_print_matrix(whole_b, whole_size); //kt_printf("\r\nC is:\r\n"); //matrix_mult_mpi_print_matrix(whole_c, whole_size); //kt_printf("\r\n"); // Verify for (i = 0; i < whole_size; i++) { for (j = 0; j < whole_size; j++) { verify_val = 0; for (k = 0; k < whole_size; k++) { verify_val += whole_a[i * whole_size + k] * whole_b[k * whole_size + j]; } if (whole_c[i * whole_size + j] != verify_val) { kt_printf("Results gathered: " 
"Verification [31;1mFAILED[0m at C[%d, %d]\r\n", i, j); while (1) { ; } } } } kt_printf("Results gathered. Verification [32;1mPASSED[0m\r\n"); } else if (rank < num_procs) { // Send partial arrays to rank 0 MPI_Send(a, blk_size_sq, MPI_FLOAT, 0, tag_a, MPI_COMM_WORLD); MPI_Send(b, blk_size_sq, MPI_FLOAT, 0, tag_b, MPI_COMM_WORLD); MPI_Send(c, blk_size_sq, MPI_FLOAT, 0, tag_c, MPI_COMM_WORLD); } finished: // Free stuff if (rank < num_procs) { kt_free(a); kt_free(b); kt_free(c); kt_free(peer_a); kt_free(peer_b); if (verify) { kt_free(whole_a); kt_free(whole_b); kt_free(whole_c); kt_free(peer_c); } } return 0; }
int main(int argc, char *argv[]) { int rank; int n_ranks, start_rank; int i,j; float gamma = 0.25, rho = -0.495266; float GLOB_SUM = 0, sum = 0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &n_ranks); MPI_Comm_rank(MPI_COMM_WORLD, &rank); printf("before get data in id %d\n", rank); get_data(rank%4); start_rank = 6; n_ranks = 4; printf("done getting dat rank %d\n", rank); MPI_Barrier(MPI_COMM_WORLD); // printf("crossing bar1 %d\n", rank); for (j = 0; j < INPUT_SIZE; ++j) { get_input(rank, start_rank, n_ranks); sum = compute_svm_sum(rank%4, gamma); if(rank == start_rank) { float tempBuff; GLOB_SUM = sum; for (i = start_rank+1; i < start_rank + n_ranks; ++i) { MPI_Recv(&tempBuff, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); GLOB_SUM = GLOB_SUM + tempBuff; } GLOB_SUM -= rho; } else { MPI_Send((float*)&sum, 1, MPI_FLOAT, start_rank, 0, MPI_COMM_WORLD); } } //if(rank != 6) //printf("before bar2 %d\n", rank); MPI_Barrier(MPI_COMM_WORLD); if(rank == 6) { #ifdef DUMP m5_dump_stats(0, 0); m5_reset_stats(0, 0); #endif } //printf("done with thread %d\n", rank); if(rank == 6) printf("global sum = %f\n", GLOB_SUM); // free_data(); MPI_Finalize(); return 0; }
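The receive loop above hand-rolls a sum reduction onto start_rank. Assuming the participating processes are exactly ranks start_rank through start_rank + n_ranks - 1, the same result can be expressed with a sub-communicator and MPI_Reduce; the sketch below is an alternative formulation, not the code this program uses, and in practice the communicator would be created once rather than once per input.

/* Sketch: sum the per-rank partial SVM values onto the lowest participating rank. */
#include <mpi.h>

float reduce_svm_sum(float sum, int rank, int start_rank, int n_ranks, float rho)
{
    int in_group = (rank >= start_rank && rank < start_rank + n_ranks);
    MPI_Comm grp;
    /* Ranks outside the group get MPI_COMM_NULL; inside the group, keys keep the
       world order, so group rank 0 corresponds to start_rank. */
    MPI_Comm_split(MPI_COMM_WORLD, in_group ? 0 : MPI_UNDEFINED, rank, &grp);

    float glob_sum = 0.0f;
    if (in_group) {
        MPI_Reduce(&sum, &glob_sum, 1, MPI_FLOAT, MPI_SUM, 0, grp);
        MPI_Comm_free(&grp);
    }
    return (rank == start_rank) ? glob_sum - rho : 0.0f;
}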
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec) { KSP_AGMRES *agmres = (KSP_AGMRES*) ksp->data; MPI_Comm comm; PetscScalar *Qloc = agmres->Qloc; PetscScalar *sgn = agmres->sgn; PetscScalar *tloc = agmres->tloc; PetscErrorCode ierr; PetscReal *wbufptr = agmres->wbufptr; PetscMPIInt rank = agmres->rank; PetscMPIInt First = agmres->First; PetscMPIInt Last = agmres->Last; PetscBLASInt pas,len,bnloc,bpos; PetscInt nloc,d, i, j, k; PetscInt pos; PetscReal c, s, rho, Ajj, val, tt, old; PetscScalar *col; MPI_Status status; PetscBLASInt N = MAXKSPSIZE + 1; PetscFunctionBegin; ierr = PetscObjectGetComm((PetscObject)ksp,&comm);CHKERRQ(ierr); ierr = PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr); ierr = PetscMemzero(agmres->Rloc, N*N*sizeof(PetscScalar));CHKERRQ(ierr); /* check input arguments */ if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors shoud be positive"); ierr = VecGetLocalSize(VEC_V(0), &nloc);CHKERRQ(ierr); ierr = PetscBLASIntCast(nloc,&bnloc);CHKERRQ(ierr); if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater or equal to the number of columns"); pas = 1; /* Copy the vectors of the basis */ for (j = 0; j < nvec; j++) { ierr = VecGetArray(VEC_V(j), &col);CHKERRQ(ierr); PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas)); ierr = VecRestoreArray(VEC_V(j), &col);CHKERRQ(ierr); } /* Each process performs a local QR on its own block */ for (j = 0; j < nvec; j++) { len = nloc - j; Ajj = Qloc[j*nloc+j]; rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas); if (rho == 0.0) tloc[j] = 0.0; else { tloc[j] = (Ajj - rho) / rho; len = len - 1; val = 1.0 / (Ajj - rho); PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas)); Qloc[j*nloc+j] = 1.0; len = len + 1; for (k = j + 1; k < nvec; k++) { PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas)); PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas)); } Qloc[j*nloc+j] = rho; } } /*annihilate undesirable Rloc, diagonal by diagonal*/ for (d = 0; d < nvec; d++) { len = nvec - d; if (rank == First) { PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas)); ierr = MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);CHKERRQ(ierr); } else { ierr = MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);CHKERRQ(ierr); /*Elimination of Rloc(1,d)*/ c = wbufptr[d]; s = Qloc[d*nloc]; ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1);CHKERRQ(ierr); /*Apply Givens Rotation*/ for (k = d; k < nvec; k++) { old = wbufptr[k]; wbufptr[k] = c * old - s * Qloc[k*nloc]; Qloc[k*nloc] = s * old + c * Qloc[k*nloc]; } Qloc[d*nloc] = rho; if (rank != Last) { ierr = MPI_Send(& (wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);CHKERRQ(ierr); } /* zero-out the d-th diagonal of Rloc ...*/ for (j = d + 1; j < nvec; j++) { /* elimination of Rloc[i][j]*/ i = j - d; c = Qloc[j*nloc+i-1]; s = Qloc[j*nloc+i]; ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1);CHKERRQ(ierr); for (k = j; k < nvec; k++) { old = Qloc[k*nloc+i-1]; Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i]; Qloc[k*nloc+i] = s * old + c * Qloc[k*nloc+i]; } Qloc[j*nloc+i] = rho; } if (rank == Last) { PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N)); for (k = d + 1; k < 
nvec; k++) *RLOC(k,d) = 0.0; } } } if (rank == Last) { for (d = 0; d < nvec; d++) { pos = nvec - d; ierr = PetscBLASIntCast(pos,&bpos);CHKERRQ(ierr); sgn[d] = PetscSign(*RLOC(d,d)); PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N)); } } /*BroadCast Rloc to all other processes * NWD : should not be needed */ ierr = MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);CHKERRQ(ierr); ierr = PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr); PetscFunctionReturn(0); }
void mpi_distribute(int Mx){ if ( taskid == MASTER ) { averow = Mx/numworkers; extra = Mx%numworkers; offset = 0; for ( rank=1; rank <= (numworkers); rank++) { rows = (rank <= extra) ? averow+1 : averow; left_node = rank - 1; right_node = rank + 1; if ( rank == 1 ) { left_node = NONE; } if ( rank == (numworkers) ) { right_node = NONE; } dest = rank; MPI_Send(&offset, 1, MPI_INT, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&left_node, 1, MPI_INT, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&right_node, 1, MPI_INT, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&phi_old[offset*Mx], rows*Mx, MPI_DOUBLE, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&mu_old[offset*Mx], rows*Mx, MPI_DOUBLE, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&u_old[offset*Mx], rows*Mx, MPI_DOUBLE, dest, BEGIN, MPI_COMM_WORLD); MPI_Send(&v_old[offset*Mx], rows*Mx, MPI_DOUBLE, dest, BEGIN, MPI_COMM_WORLD); offset = offset + rows; } }else{ source = MASTER; MPI_Recv(&offset, 1, MPI_INT, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&left_node, 1, MPI_INT, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&right_node, 1, MPI_INT, source, BEGIN, MPI_COMM_WORLD, &status); start = 1; if((taskid ==1) || (taskid == numworkers)) { if(taskid == 1) { MPI_Recv(&phi_old[0], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&mu_old[0], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #ifdef FLUID MPI_Recv(&u_old[0], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&v_old[0], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #endif } else { MPI_Recv(&phi_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&mu_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #ifdef FLUID MPI_Recv(&u_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&v_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #endif } end = rows-1; } else { MPI_Recv(&phi_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&mu_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #ifdef FLUID MPI_Recv(&u_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); MPI_Recv(&v_old[Mx], rows*Mx, MPI_DOUBLE, source, BEGIN, MPI_COMM_WORLD, &status); #endif end = rows; } } }
void OUTPUT_ADAM_STATS(ElementsHashTable* El_Table, MatProps* matprops_ptr, TimeProps* timeprops_ptr, StatProps* statprops_ptr) { int myid, numprocs; IF_MPI(MPI_Status status); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); double velocity2 = 0.0; double vmax = 0, hmax = 0; double xy_cen[2] = { 0.0, 0.0 }, vh_cen[2] = { 0.0, 0.0 }; double xyh_vmax[3] = { 0.0, 0.0, 0.0 }; double xyv_hmax[3] = { 0.0, 0.0, 0.0 }; double masscenterdist2 = 0.0, masscentermindist2 = HUGE_VAL, xycen[2] = { 0.0, 0.0 }; double vmax_min_height = matprops_ptr->scale.max_negligible_height * 512.0 * ADAM_HEIGHT_FRAC; int i; struct { //for use with MPI_MAXLOC double val; int rank; } send, receive; xy_cen[0] = statprops_ptr->xcen / (matprops_ptr->scale.length); xy_cen[1] = statprops_ptr->ycen / (matprops_ptr->scale.length); double VxVy[2]; int no_of_buckets = El_Table->get_no_of_buckets(); vector<HashEntryLine> &bucket=El_Table->bucket; tivector<Element> &elenode_=El_Table->elenode_; //@ElementsBucketDoubleLoop for(int ibuck = 0; ibuck < no_of_buckets; ibuck++) { for(int ielm = 0; ielm < bucket[ibuck].ndx.size(); ielm++) { Element* EmTemp = &(elenode_[bucket[ibuck].ndx[ielm]]); if(EmTemp->adapted_flag() > 0) { double height=EmTemp->state_vars(0); EmTemp->eval_velocity(0.0, 0.0, VxVy); velocity2 = VxVy[0] * VxVy[0] + VxVy[1] * VxVy[1]; //get v and h at center of mass masscenterdist2 = (EmTemp->coord(0) - xy_cen[0]) * (EmTemp->coord(0) - xy_cen[0]) + (EmTemp->coord(1) - xy_cen[1]) * (EmTemp->coord(1) - xy_cen[1]); if(masscenterdist2 < masscentermindist2) { masscentermindist2 = masscenterdist2; vh_cen[0] = velocity2; vh_cen[1] = height; xycen[0] = EmTemp->coord(0); xycen[1] = EmTemp->coord(1); } //eliminate fast moving very thin pile from consideration if(height >= vmax_min_height) { if(velocity2 > vmax) { /* velocity2 is not a mistake... 
only need to take the root of the maximum value */ vmax = velocity2; xyh_vmax[0] = EmTemp->coord(0); xyh_vmax[1] = EmTemp->coord(1); xyh_vmax[2] = height; } } if(height > hmax) { hmax = height; xyv_hmax[0] = EmTemp->coord(0); xyv_hmax[1] = EmTemp->coord(1); xyv_hmax[2] = velocity2; } } } } vh_cen[0] = sqrt(vh_cen[0]); vmax = sqrt(vmax); xyv_hmax[2] = sqrt(xyv_hmax[2]); /* get the max value accross all processors */ #ifdef USE_MPI if(numprocs > 1) { send.rank = myid; //at center of mass send.val = masscentermindist2; MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MINLOC, MPI_COMM_WORLD); if(receive.rank != 0) { /* don't send location if it's already on the root processor */ if(receive.rank == myid) MPI_Send(vh_cen, 2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); else if(myid == 0) MPI_Recv(vh_cen, 2, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status); } //at location of vmax send.val = vmax; MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD); vmax = receive.val; if(receive.rank != 0) { /* don't send location if it's already on the root processor */ if(receive.rank == myid) MPI_Send(xyh_vmax, 3, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); else if(myid == 0) MPI_Recv(xyh_vmax, 3, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status); } //at location of hmax send.val = hmax; MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD); hmax = receive.val; if(receive.rank != 0) { /* don't send location if it's already on the root processor */ if(receive.rank == myid) MPI_Send(xyv_hmax, 3, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD); else if(myid == 0) MPI_Recv(xyv_hmax, 3, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status); } } #endif //USE_MPI if(myid == 0) { FILE *fp = fopen("flow_dynamics.stats", "a"); fprintf(fp, "%16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g, %16.10g\n", timeprops_ptr->timesec(), //time in seconds statprops_ptr->vmean, //average velocity //x,y,v,h at center of mass statprops_ptr->xcen, statprops_ptr->ycen, vh_cen[0] * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)), vh_cen[1] * (matprops_ptr->scale.height), //x,y,v,h at location of vmax xyh_vmax[0] * matprops_ptr->scale.length, xyh_vmax[1] * matprops_ptr->scale.length, vmax * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)), xyh_vmax[2] * (matprops_ptr->scale.height), //x,y,v,h at location of hmax xyv_hmax[0] * matprops_ptr->scale.length, xyv_hmax[1] * matprops_ptr->scale.length, xyv_hmax[2] * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)), hmax * (matprops_ptr->scale.height)); fclose(fp); } return; }
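The val/rank struct combined with MPI_DOUBLE_INT and MPI_MAXLOC (or MPI_MINLOC) is the core idiom above: the reduction tells every rank which process owns the extremum, and only that owner then ships the associated coordinates to rank 0. A stripped-down sketch of the same pattern follows; the function and parameter names are illustrative only.

/* Sketch: find which rank owns the global maximum of 'val', then move that
   rank's 3-double payload (e.g. x, y, h) to rank 0. */
#include <mpi.h>

void maxloc_with_payload(double val, const double payload[3], double out[3])
{
    int myid;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    struct { double val; int rank; } send, receive;
    send.val = val;
    send.rank = myid;
    MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);

    if (receive.rank != 0) {            /* otherwise the payload is already on rank 0 */
        if (myid == receive.rank)
            MPI_Send((void *) payload, 3, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
        else if (myid == 0)
            MPI_Recv(out, 3, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else if (myid == 0) {
        out[0] = payload[0]; out[1] = payload[1]; out[2] = payload[2];
    }
}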
int main(int argc, char* argv[]) { std::chrono::time_point<std::chrono::high_resolution_clock> tStart; std::chrono::time_point<std::chrono::high_resolution_clock> tStop; typedef std::chrono::duration<int,std::milli> millisecs_t ; int numprocs, rank, edge, pixel_count, start, end; double max_values_sq; Uint32 max_iter; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if(numprocs <= 1) { std::cerr << argv[0] << ": error: requires at least two MPI processes\n"; return 1; } max_values_sq = 4.0; max_iter = 5000; edge = (MAX_X * MAX_Y) / (numprocs - 1); if(rank > 0) { int tile = rank - 1; Uint32* pixels; start = tile * edge; end = (tile == numprocs - 2) ? MAX_X * MAX_Y : (tile + 1) * edge; pixel_count = end - start; pixels = (Uint32*) malloc(pixel_count * sizeof(Uint32)); calc_lines(start, end, pixels, max_values_sq, max_iter); MPI_Send((void*)pixels, pixel_count, MPI_INT, 0, 0, MPI_COMM_WORLD); free(pixels); } else /* rank == 0 */ { int tile, recv_count = (edge + 1); char title[100]; Uint32* field = (Uint32*) malloc(MAX_X * MAX_Y * sizeof(Uint32)); Uint32* fieldpos; SDL_Surface* sdlSurface; SDL_Event event; MPI_Status status; tStart = std::chrono::high_resolution_clock::now(); for(tile = 1; tile < numprocs; tile++) { start = (tile - 1) * edge; end = (tile == numprocs - 1) ? MAX_X * MAX_Y : tile * edge; pixel_count = end - start; recv_count = pixel_count; fieldpos = field+start; MPI_Recv(fieldpos, recv_count, MPI_INT, tile, MPI_ANY_TAG, MPI_COMM_WORLD, &status); } tStop = std::chrono::high_resolution_clock::now(); millisecs_t duration( std::chrono::duration_cast<millisecs_t>(tStop-tStart) ) ; long elapsed = duration.count(); SDL_Init(SDL_INIT_EVERYTHING); sdlSurface = SDL_SetVideoMode(MAX_X, MAX_Y, 32, SDL_HWSURFACE | SDL_DOUBLEBUF); std::stringstream ss; ss << argv[0] << " " << numprocs << " processes " << elapsed*1.e-3 << " sec." << "\n"; SDL_WM_SetCaption(ss.str().c_str(), title); std::cout << ss.str().c_str() << "\n"; draw(sdlSurface, field); SDL_Flip(sdlSurface); do { SDL_Delay(50); SDL_PollEvent(&event); } while( event.type != SDL_QUIT && event.type != SDL_KEYDOWN ); SDL_FreeSurface(sdlSurface); SDL_Quit(); free(field); } MPI_Finalize(); return 0; }
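The pixel tiles above are Uint32 but are transmitted as MPI_INT, which only works because both happen to be 32 bits wide on this platform. A sketch of the same transfer using the fixed-width MPI_UINT32_T datatype (available since MPI 2.2) is shown below; the helper names are hypothetical.

/* Sketch: move a tile of 32-bit pixels with an explicitly sized datatype. */
#include <stdint.h>
#include <mpi.h>

void send_tile(const uint32_t *pixels, int pixel_count)
{
    MPI_Send((void *) pixels, pixel_count, MPI_UINT32_T, 0, 0, MPI_COMM_WORLD);
}

void recv_tile(uint32_t *field, int start, int pixel_count, int tile_rank)
{
    MPI_Recv(field + start, pixel_count, MPI_UINT32_T, tile_rank, MPI_ANY_TAG,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}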
void ghost_communicator(GhostCommunicator *gc) { MPI_Status status; int n, n2; int data_parts = gc->data_parts; GHOST_TRACE(fprintf(stderr, "%d: ghost_comm %p, data_parts %d\n", this_node, gc, data_parts)); for (n = 0; n < gc->num; n++) { GhostCommunication *gcn = &gc->comm[n]; int comm_type = gcn->type & GHOST_JOBMASK; int prefetch = gcn->type & GHOST_PREFETCH; int poststore = gcn->type & GHOST_PSTSTORE; int node = gcn->node; GHOST_TRACE(fprintf(stderr, "%d: ghost_comm round %d, job %x\n", this_node, n, gc->comm[n].type)); GHOST_TRACE(fprintf(stderr, "%d: ghost_comm shift %f %f %f\n",this_node, gc->comm[n].shift[0], gc->comm[n].shift[1], gc->comm[n].shift[2])); if (comm_type == GHOST_LOCL) cell_cell_transfer(gcn, data_parts); else { /* prepare send buffer if necessary */ if (is_send_op(comm_type, node)) { /* ok, we send this step, prepare send buffer if not yet done */ if (!prefetch) prepare_send_buffer(gcn, data_parts); else { GHOST_TRACE(fprintf(stderr, "%d: ghost_comm using prefetched data for operation %d, sending to %d\n", this_node, n, node)); #ifdef ADDITIONAL_CHECKS if (n_s_buffer != calc_transmit_size(gcn, data_parts)) { fprintf(stderr, "%d: ghost_comm transmission size and current size of cells to transmit do not match\n", this_node); errexit(); } #endif } } else { /* we do not send this time, let's look for a prefetch */ if (prefetch) { /* find next action where we send and which has PREFETCH set */ for (n2 = n+1; n2 < gc->num; n2++) { GhostCommunication *gcn2 = &gc->comm[n2]; int comm_type2 = gcn2->type & GHOST_JOBMASK; int prefetch2 = gcn2->type & GHOST_PREFETCH; int node2 = gcn2->node; if (is_send_op(comm_type2, node2) && prefetch2) { GHOST_TRACE(fprintf(stderr, "%d: ghost_comm prefetch operation %d, is send/bcast to/from %d\n", this_node, n2, node2)); prepare_send_buffer(gcn2, data_parts); break; } } } } /* recv buffer for recv and multinode operations to this node */ if (is_recv_op(comm_type, node)) prepare_recv_buffer(gcn, data_parts); /* transfer data */ switch (comm_type) { case GHOST_RECV: GHOST_TRACE(fprintf(stderr, "%d: ghost_comm receive from %d (%d bytes)\n", this_node, node, n_r_buffer)); MPI_Recv(r_buffer, n_r_buffer, MPI_BYTE, node, REQ_GHOST_SEND, MPI_COMM_WORLD, &status); break; case GHOST_SEND: GHOST_TRACE(fprintf(stderr, "%d: ghost_comm send to %d (%d bytes)\n", this_node, node, n_s_buffer)); MPI_Send(s_buffer, n_s_buffer, MPI_BYTE, node, REQ_GHOST_SEND, MPI_COMM_WORLD); break; case GHOST_BCST: GHOST_TRACE(fprintf(stderr, "%d: ghost_comm bcast from %d (%d bytes)\n", this_node, node, (node == this_node) ? n_s_buffer : n_r_buffer)); if (node == this_node) MPI_Bcast(s_buffer, n_s_buffer, MPI_BYTE, node, MPI_COMM_WORLD); else MPI_Bcast(r_buffer, n_r_buffer, MPI_BYTE, node, MPI_COMM_WORLD); break; case GHOST_RDCE: GHOST_TRACE(fprintf(stderr, "%d: ghost_comm reduce to %d (%d bytes)\n", this_node, node, n_s_buffer)); if (node == this_node) MPI_Reduce(s_buffer, r_buffer, n_s_buffer, MPI_BYTE, MPI_FORCES_SUM, node, MPI_COMM_WORLD); else MPI_Reduce(s_buffer, NULL, n_s_buffer, MPI_BYTE, MPI_FORCES_SUM, node, MPI_COMM_WORLD); break; } GHOST_TRACE(MPI_Barrier(MPI_COMM_WORLD)); GHOST_TRACE(fprintf(stderr, "%d: ghost_comm done\n", this_node)); /* recv op; write back data directly, if no PSTSTORE delay is requested. */ if (is_recv_op(comm_type, node)) { if (!poststore) { /* forces have to be added, the rest overwritten. Exception is RDCE, where the addition is integrated into the communication. 
*/ if (data_parts == GHOSTTRANS_FORCE && comm_type != GHOST_RDCE) add_forces_from_recv_buffer(gcn); else put_recv_buffer(gcn, data_parts); } else { GHOST_TRACE(fprintf(stderr, "%d: ghost_comm delaying operation %d, recv from %d\n", this_node, n, node)); } } else { /* send op; write back delayed data from last recv, when this was a prefetch send. */ if (poststore) { /* find previous action where we recv and which has PSTSTORE set */ for (n2 = n-1; n2 >= 0; n2--) { GhostCommunication *gcn2 = &gc->comm[n2]; int comm_type2 = gcn2->type & GHOST_JOBMASK; int poststore2 = gcn2->type & GHOST_PSTSTORE; int node2 = gcn2->node; if (is_recv_op(comm_type2, node2) && poststore2) { GHOST_TRACE(fprintf(stderr, "%d: ghost_comm storing delayed recv, operation %d, from %d\n", this_node, n2, node2)); #ifdef ADDITIONAL_CHECKS if (n_r_buffer != calc_transmit_size(gcn2, data_parts)) { fprintf(stderr, "%d: ghost_comm transmission size and current size of cells to transmit do not match\n", this_node); errexit(); } #endif /* as above */ if (data_parts == GHOSTTRANS_FORCE && comm_type != GHOST_RDCE) add_forces_from_recv_buffer(gcn2); else put_recv_buffer(gcn2, data_parts); break; } } } } } } }
int main(int argc, char* argv[]) { #ifdef BENCHMARKING benchmark(argc, argv); #else // mpi setup int numProcs; int rank, flag; int done = 0; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &numProcs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // create a buffer for both worker and controller static double buffer[BUFFER_SIZE]; unsigned int niter = argc > 1 ? atoi(argv[1]) : NITER; // Setting up the PSF (statically) int psfWidth, psfHeight; double* psf = ImageQueue::getPsf(&psfWidth, &psfHeight); // ---------- CONTROLLER NODE ---------- // if (rank == 0) { // Set up producer ImageQueue images(buffer, BUFFER_SIZE, "../images", numProcs); // Print out some details int numImages = images.remaining(); FPRINT("Starting %d iteration(s) on %d image(s)", niter, numImages); PerfTimer mainTimer; mainTimer.begin(); int toSend = (unsigned int)numProcs < images.remaining() ? numProcs : images.remaining(); for (int i = 0; i < toSend; i++) { images.pop(i); MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD); } while (images.remaining() > 0) { for (int i = 0; i < numProcs; i++) { // If an image is received then save it and send the next one MPI_Iprobe(i, IMG, MPI_COMM_WORLD, &flag, &status); if (flag) { MPI_Recv(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD, &status); images.save(i); images.pop(i); MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD); } } } for (int i = 0; i < numProcs; i++) { MPI_Send(&done, 1, MPI_INT, i, END, MPI_COMM_WORLD); } FPRINT("Finished %d image(s) in %f seconds", numImages, mainTimer.getElapsed()); } // ---------- WORKER NODE ---------- // else { // worker thread // Set up consumer DeconvFilter filter(WIDTH, HEIGHT, niter, psf, psfWidth, psfHeight, buffer); bool running = true; PRINT("Worker thread initialised."); while (running) { MPI_Iprobe(0, IMG, MPI_COMM_WORLD, &flag, &status); if (flag) { // New image MPI_Recv(buffer, BUFFER_SIZE, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD, &status); filter.process(); MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD); } MPI_Iprobe(0, END, MPI_COMM_WORLD, &flag, &status); if (flag) { // Execution finished MPI_Recv(&done, 1, MPI_INT, 0, END, MPI_COMM_WORLD, &status); running = false; } } PRINT("Worker thread finished."); } MPI_Finalize(); #endif return 0; }
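The worker above busy-polls two tags with MPI_Iprobe even though it has nothing to do until a message arrives. A blocking MPI_Probe on MPI_ANY_TAG followed by a dispatch on status.MPI_TAG expresses the same protocol without spinning; the sketch below assumes the IMG and END tags of the original (their numeric values here are placeholders) and abstracts the filter as a callback.

/* Sketch of the worker loop with a blocking probe instead of Iprobe polling. */
#include <mpi.h>

enum { IMG = 1, END = 2 };   /* assumed tag values; the original defines its own */

void worker_loop(double *buffer, int buffer_size, void (*process)(double *))
{
    MPI_Status status;
    int done;
    for (;;) {
        /* Sleep until the controller sends either an image or the end marker. */
        MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == IMG) {
            MPI_Recv(buffer, buffer_size, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD, &status);
            process(buffer);
            MPI_Send(buffer, buffer_size, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD);
        } else {   /* END */
            MPI_Recv(&done, 1, MPI_INT, 0, END, MPI_COMM_WORLD, &status);
            break;
        }
    }
}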
int main(int argc, char **argv) { MPI_Init(&argc, &argv); double a[ISIZE][JSIZE]; int myRank; int numProcesses; MPI_Comm_rank(MPI_COMM_WORLD, &myRank); MPI_Comm_size(MPI_COMM_WORLD, &numProcesses); if (myRank == 0) { // Initialization for (int i = 0; i < ISIZE; ++i) { for (int j = 0; j < JSIZE; ++j) { a[i][j] = 10 * i + j; } } } double timeElapsedMs; MEASURE_TIME_MS_BEGIN(timeElapsedMs); // Decide which process is responsible for which part of data int segmentsLength[numProcesses]; int segmentsStart[numProcesses]; int segmentsEnd[numProcesses]; int base = JSIZE / numProcesses; int rem = JSIZE % numProcesses; for (int p = 0; p < numProcesses; ++p) { if (p < rem) { segmentsStart[p] = p * (base + 1); segmentsEnd[p] = segmentsStart[p] + base; } else { segmentsStart[p] = rem * (base + 1) + (p - rem) * base; segmentsEnd[p] = segmentsStart[p] + base - 1; } segmentsLength[p] = segmentsEnd[p] - segmentsStart[p] + 1; } double dataSplitted[JSIZE][ISIZE]; if (myRank == 0) { for (int j0 = 0; j0 < JSIZE; ++j0) { for (int i = 0, j = j0; i < ISIZE; ++i, j = (j + JSIZE - 1) % JSIZE) { dataSplitted[j0][i] = a[i][j]; } } } if (myRank == 0) { // Send splitted data to processes for (int p = 1; p < numProcesses; p++) { MPI_Send(dataSplitted[segmentsStart[p]], ISIZE * segmentsLength[p], MPI_DOUBLE, p, TAG_SEND_DATA, MPI_COMM_WORLD); } } else { // Recevive data from master process MPI_Status status; MPI_Recv(dataSplitted[segmentsStart[myRank]], ISIZE * segmentsLength[myRank], MPI_DOUBLE, 0, TAG_SEND_DATA, MPI_COMM_WORLD, &status); } // Compute partial results for (int j0 = segmentsStart[myRank]; j0 <= segmentsEnd[myRank]; ++j0) { for (int i = 0, j = j0; i < ISIZE; ++i, j = (j + JSIZE - 1) % JSIZE) { if ((j < JSIZE - 1) && (i > 0)) { for (int k = 0; k < KSIZE; ++k) { dataSplitted[j0][i] = sin(0.01 * dataSplitted[j0][i - 1]); } } } } if (myRank > 0) { // Send data back to the master MPI_Send(dataSplitted[segmentsStart[myRank]], ISIZE * segmentsLength[myRank], MPI_DOUBLE, 0, TAG_COLLECT_RESULTS, MPI_COMM_WORLD); } else { // Collect data from the processes for (int p = 1; p < numProcesses; ++p) { MPI_Status status; MPI_Recv(dataSplitted[segmentsStart[p]], ISIZE * segmentsLength[p], MPI_DOUBLE, p, TAG_COLLECT_RESULTS, MPI_COMM_WORLD, &status); } } if (myRank == 0) { // Put results back into matrix, output time and print results for (int j0 = 0; j0 < JSIZE; ++j0) { for (int i = 0, j = j0; i < JSIZE; ++i, j = (j + JSIZE - 1) % JSIZE) { a[i][j] = dataSplitted[j0][i]; } } MEASURE_TIME_MS_END(timeElapsedMs); printf("%.6lf\n", timeElapsedMs); FILE *ff = fopen("parallel_2.out", "w"); for (int i = 0; i < ISIZE; ++i) { for (int j = 0; j < JSIZE; ++j) { fprintf(ff, "%f ", a[i][j]); } fprintf(ff, "\n"); } fclose(ff); } //printf("Process %d, segmentsBegin: %d, segmentsEnd: %d\n", myRank, segmentsStart[myRank], // segmentsEnd[myRank]); MPI_Finalize(); return 0; }
/* ============================================================================ function MainProcess: seqNum: Number of sequences in string sequences (ex. 3) sequences: holds the sequences. (ex. [GTGCAACGTACT]) seqLen: Array of the length of each sequence in sequences. (ex. 5,4,3) ======= which means that we have three sequences GTGCA, ACGT, and ACT. stype: Scoring type (1: linear score, 2: PAM250 if protein, 3: BLOSUM if protein) partitionSize: Partion Size ============================================================================== */ void MainProcess (MOATypeDimn seqNum, char * * sequences, char * * seqName, MOATypeShape * seqLen, int stype, long partitionSize) { ProcessData * pData = NULL; ScoringData * sData = NULL; WavesData * wData = NULL; MOATypeDimn k; MOATypeInd i; int ret, startflag; struct rusage usageRec; double utime, stime; MPI_Status status; double t_start, t_finish; char command[2000]; #ifndef NDEBUG char msg[SHORT_MESSAGE_SIZE]; int dbglevel = 0; MOATypeInd j; #endif t_start = MPI_Wtime(); /* print the input arguments ============================================*/ //PrintSequencies (0, seqNum, sequences, seqLen); #ifndef NDEBUG sprintf(msg, ">>>>MainProcess: Scoring Type: %d\n>>>>Partition Size: %ld\n", stype, partitionSize); mprintf(dbglevel, msg, 1); #endif /* Initialize Process Memory pData (function located in partitioning.c*/ ret = initProcessMemory(&pData, &sData, &wData, seqNum, seqLen, sequences, seqName, stype, partitionSize); if (ret != 0) { mprintf (0, ">>>>MainProcess: Error Initializing Process Data, Exiting\n", 1); fflush (stdout); return; } /* if restore previouse run read check point data here, do not calculate waves */ pData->OCout = NULL; pData->OCin = NULL; if (Mode != Distributed) { /* Construct MOA record */ pData->msaAlgn = NULL; createMOAStruct(&pData->msaAlgn); if (createMOA(seqLen, seqNum, pData->msaAlgn, 0, 0) < 0) return; wData->wavesTotal = 1; wData->AllpartsInWave = mmalloc((MOATypeInd) sizeof *wData->AllpartsInWave); wData->AllpartsInWave[0] = 1; pData->waveNo = 0; pData->partNo = 0; pData->partitionsCount = 1; /*pData->OCout = mmalloc((MOATypeInd) sizeof *(pData->OCout)); if (pData->OCout == NULL) { mprintf(1, "Couldn't create memory for OCout while adding an OC. Exiting.\n", 3); printf("Couldn't create memory for OCout while . Exiting.\n",); return; } pData->OCout[0].wavesOC = 0; pData->OCout[0].WOCO = NULL; */ #ifndef NDEBUG sprintf (msg, "[%d]>ScoreCompThread[%ld]: Will call ComputePartitionScores\n", myProcid, pData->computedPartitions); mprintf (dbglevel, msg, 1); #endif /* Compute Scored for Current Partition*/ if (Algorithm == DP) DPComputeScores (pData, sData, wData); else if (Algorithm == SP) DPComputeScores (pData, sData, wData); /* Print elements ======================================================= */ //printMOA_scr(pData->msaAlgn, 0); /* Print Indexes ========================================================*/ //printMOA_scr(pData->msaAlgn, 1); pData->computedPartitions ++; checkPoint (pData, sData); } else { if (RestoreFlag == 1) { /* restore data */ restoreCheckPoint (pData, wData); pData->globalWaveNo = pData->waveNo; } else { pData->partNo = 0; pData->waveNo = 0; if (myProcid == 0) { printf ("[%d] Calculating waves and partitions .... 
", myProcid); calcWaves (pData, wData); currNow = getTime(); printf("[%d] Done Calculating waves and partitions time (%d, %d, %d, %d)\n", myProcid, currNow->tm_yday, currNow->tm_hour, currNow->tm_min, currNow->tm_sec); fflush(stdout); if( checkPointWavesCalculations (pData, wData) == 0) { startflag = 1; for (i=1; i<ClusterSize; i++) MPI_Send(&startflag, 1, MPI_INT, i, 0, MOAMSA_COMM_WORLD); } else { printf ("[%d]Couldn't write Waves calculations, Exiting\n", myProcid); return; } } else { //printf ("[%d] waiting for start flag\n", myProcid); MPI_Recv(&startflag, 1, MPI_INT, 0, 0, MOAMSA_COMM_WORLD, &status); //printf ("[%d] received start flag = %d\n", myProcid, startflag); printf ("[%d] Reading waves and partitions .... ", myProcid); if (restoreWavesCalculations(pData, wData) != 0) { printf ("[%d]Couldn't read Waves calculations, Exiting\n", myProcid); return; } printf ("done.\n"); fflush(stdout); } } #ifndef NDEBUG sprintf(msg, "[%d]>MainProcess: Current Wave: %ld - Current Partition: %ld - Total Partitions in Process: %ld\n", myProcid, pData->waveNo, pData->partNo, pData->partitionsCount); mprintf(dbglevel, msg, 1); #endif ScoreCompThread (pData, sData, wData); } if (myProcid == 0) { t_finish = MPI_Wtime(); /* Getting Process Resources Usage ===================== */ ret = getrusage(RUSAGE_SELF, &usageRec); if (ret == 0) { //printf ("[%d]Resources Usage: UTime %ld, STime %ld, Mem %ld, Virt %ld\n", myProcid, usageRec.ru_utime.tv_sec, usageRec.ru_stime.tv_sec, usageRec.ru_maxrss, usageRec.ru_ixrss); utime = (double) usageRec.ru_utime.tv_sec + 1.e-6 * (double) usageRec.ru_utime.tv_usec; stime = (double) usageRec.ru_stime.tv_sec + 1.e-6 * (double) usageRec.ru_stime.tv_usec; //printf ("[%d]Resources Usage: UTime %f, STime %f\n", myProcid, utime, stime); } else printf ("[%d]Failed to retrieve Process Resources Usage, errno %d\n", myProcid, errno); //struct mallinfo info; //info = mallinfo(); //printf("[%d] STime\tUTime\theap\tMemory\t\n",myProcid); //printf("[%d] %f\t%f\t%d\t%d\n", myProcid, stime, utime, info.arena, info.usmblks + info.uordblks); printf("STime\tUTime\n"); printf("%f\t%f\n", stime, utime); printf ("Elsp-time: %f\n", t_finish - t_start); fflush(stdout); sprintf (command, "prstat 1 1 > /export/home/mhelal1/thesis/exp/run/prstatus/prst_%s", outputfilename); i = system (command); } /* Free allocated memory and exit routine ===================== */ freeProcessMemory (&pData, &sData, &wData); }
int main (int argc, char **argv) { MPI_Init (&argc, &argv); GetPot cl (argc, argv); if (cl.search (2, "-h", "--help")) { std::cerr << help_text << std::endl; return 0; } const double a = cl.follow (double (0.0), "-a"); const double b = cl.follow (double (1.0), "-b"); const unsigned int nnodes = cl.follow (100, 2, "-n", "--nnodes"); const unsigned int nel = nnodes - 1; const std::string diffusion = cl.follow ("1.0", 2, "-d", "--diffusion"); const std::string forcing = cl.follow ("1.0", 2, "-f", "--forcing"); const double L = b - a; constexpr double tol = 1e-6; constexpr unsigned int maxit = 100; constexpr unsigned int overlap = 100; MPI_Status status; int mpi_size, mpi_rank, tag; MPI_Comm_size (MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank); const double L_loc = L / double(mpi_size); const double h = L_loc / ceil (double(nel) / double(mpi_size)); double a_loc = .0; double lval = .0; double b_loc = .0; double rval = .0; double buffer = .0; unsigned int nel_loc = 0; unsigned int ndof_loc = 1; fem_1d<double> *subproblems; coeff<double> a_coeff (diffusion); coeff<double> f_coeff (forcing); a_loc = a + mpi_rank * L_loc; b_loc = a_loc + L_loc; nel_loc = ceil (double(nel) / double(mpi_size)); if (mpi_rank > 0) { a_loc -= overlap * h; nel_loc += overlap; } if (mpi_rank < mpi_size - 1) { b_loc += overlap * h; nel_loc += overlap; } ndof_loc = nel_loc + 1; subproblems = new fem_1d<double> (new mesh<double> (a_loc, b_loc, ndof_loc)); subproblems->set_diffusion_coefficient (a_coeff); subproblems->set_source_coefficient (f_coeff); subproblems->assemble (); subproblems->set_dirichlet (fem_1d<double>::left_boundary, 0.0); subproblems->set_dirichlet (fem_1d<double>::right_boundary, 0.0); subproblems->solve (); for (unsigned int it = 0; it < maxit; ++it) { // With the following implementation // communication will occur sequentailly // left to right first then right to left // Receive from left neighbour if (mpi_rank > 0) { std::cerr << "rank " << mpi_rank << " receiving lval from rank " << mpi_rank - 1 << std::endl; MPI_Recv (&buffer, 1, MPI_DOUBLE, mpi_rank - 1, MPI_ANY_TAG, MPI_COMM_WORLD, &status); std::cerr << "rank " << mpi_rank << " received lval from rank " << mpi_rank - 1 << std::endl; lval = buffer; } tag = 10*mpi_rank; // Send to right neighbour if (mpi_rank < mpi_size - 1) { buffer = subproblems->result () [ndof_loc - 1 - 2*overlap]; std::cerr << "rank " << mpi_rank << " sending lval to rank " << mpi_rank + 1 << std::endl; MPI_Send (&buffer, 1, MPI_DOUBLE, mpi_rank + 1, tag, MPI_COMM_WORLD); std::cerr << "rank " << mpi_rank << " sent lval to rank " << mpi_rank + 1 << std::endl; } // Receive from right neighbour if (mpi_rank < mpi_size - 1) { std::cerr << "rank " << mpi_rank << " receiving rval from rank " << mpi_rank + 1 << std::endl; MPI_Recv (&buffer, 1, MPI_DOUBLE, mpi_rank + 1, MPI_ANY_TAG, MPI_COMM_WORLD, &status); std::cerr << "rank " << mpi_rank << " received rval from rank " << mpi_rank + 1 << std::endl; rval = buffer; } tag = 10*mpi_rank + 1; // Send to right neighbour if (mpi_rank > 0) { buffer = subproblems->result () [2*overlap]; std::cerr << "rank " << mpi_rank << " sending rval to rank " << mpi_rank - 1 << std::endl; MPI_Send (&buffer, 1, MPI_DOUBLE, mpi_rank - 1, tag, MPI_COMM_WORLD); std::cerr << "rank " << mpi_rank << " sent rval to rank " << mpi_rank - 1 << std::endl; } subproblems->set_dirichlet (fem_1d<double>::left_boundary, lval); subproblems->set_dirichlet (fem_1d<double>::right_boundary, rval); subproblems->solve (); } for (int rank = 0; rank < 
mpi_size; ++rank) { if (rank == mpi_rank) for (unsigned int ii = 0; ii < ndof_loc; ++ii) std::cout << subproblems->m->nodes[ii] << " " << subproblems->result ()(ii, 0) << std::endl; MPI_Barrier (MPI_COMM_WORLD); } MPI_Finalize (); return 0; };
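The interface-value exchange above is serialized: left-to-right first, then right-to-left, with the ordering of sends and receives chosen to avoid deadlock. MPI_Sendrecv pairs each send with the opposite receive in a single call and handles the domain ends with MPI_PROC_NULL, so no ordering is needed; the sketch below shows one iteration's exchange under the same neighbour layout (helper name and tag are assumptions).

/* Sketch: exchange Schwarz interface values with both neighbours in one step. */
#include <mpi.h>

void exchange_interface_values(double send_right, double send_left,
                               double *recv_from_left, double *recv_from_right,
                               int rank, int size, MPI_Comm comm)
{
    const int tag = 0;
    int left  = (rank > 0)        ? rank - 1 : MPI_PROC_NULL;
    int right = (rank < size - 1) ? rank + 1 : MPI_PROC_NULL;

    /* Send my value for the right neighbour, receive what my left neighbour sent. */
    MPI_Sendrecv(&send_right, 1, MPI_DOUBLE, right, tag,
                 recv_from_left, 1, MPI_DOUBLE, left, tag, comm, MPI_STATUS_IGNORE);
    /* And the symmetric exchange in the other direction. */
    MPI_Sendrecv(&send_left, 1, MPI_DOUBLE, left, tag,
                 recv_from_right, 1, MPI_DOUBLE, right, tag, comm, MPI_STATUS_IGNORE);
}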
int main(int argc, char *argv[]) {
  long N=20, M=30;     // number of cells NxM
  int n=2, m=3;        // number of blocks nxm
  int tpi=16, tpj=18;  // test pressure coordinates
  int tai=7, taj=9;    // test average coordinates
  int i, j, I, J;      // local and global i,j
  int myi, myj;        // my i,j in neighbor map
  int bi, bj;          // block size in y and x direction
  int numprocs, myid;  // number of processors and my rank id
  double **P, **A;     // 2D array of pressures and averages
  int **B;             // 2D array with map of neighbors

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);

  // get command line arguments if any
  if (argc > 1) {
    if (argc != 5) {
      if (myid==0) {
        fprintf(stderr, "usage: prog [N M n m]\n");
        fprintf(stderr, "Parameters:\n");
        fprintf(stderr, "\tN: number of rows or cells in y direction. Default: %ld\n", N);
        fprintf(stderr, "\tM: number of columns or cells in x direction. Default: %ld\n", M);
        fprintf(stderr, "\tn: number of blocks in y direction. Default: %d\n", n);
        fprintf(stderr, "\tm: number of blocks in x direction. Default %d\n", m);
      }
      MPI_Finalize();
      exit(3);
    }
    N = atoi(argv[1]);
    M = atoi(argv[2]);
    n = atoi(argv[3]);
    m = atoi(argv[4]);
  }
  bi = N/n;
  bj = M/m;

  // start message
  if (myid==0) {
    printf("Terapressure v0.1\n");
    printf("=================\n");
    printf("Number of cells: %lu (%lu x %lu)\n", N*M, N, M);
    printf("Number of blocks: %d (%d x %d)\n", n*m, n, m);
    printf("Number of processors %d\n", numprocs);
    printf("Block size: (%d x %d)\n", bi, bj);
  }

  // validate parameters
  if (N % n != 0 || M % m != 0) {
    if(myid==0) fprintf(stderr,"Number of blocks in x or y axis do not fit.\n");
    MPI_Finalize();
    exit(1);
  }
  if (numprocs != n*m) {
    if (myid==0) fprintf(stderr,"Number of processors must be the same as number of blocks: %d\n", n*m);
    MPI_Finalize();
    exit(2);
  }

  double t = MPI_Wtime();

  // memory allocation
  // stack allocation is simple but limited in size
  // double P[bi][bj];
  // double A[bi][bj];
  // int B[n][m];
  // heap allocation
  P = malloc(sizeof(double*) * bi);
  A = malloc(sizeof(double*) * bi);
  for (i=0; i < bi; i++) {
    P[i] = malloc(sizeof(double) * bj);
    A[i] = malloc(sizeof(double) * bj);
  }
  B = malloc(sizeof(int*) * n);
  for (i=0; i < n; i++) {
    B[i] = malloc(sizeof(int) * m);
  }

  // domain decomposition
  int rank = 0;
  //printf("Neighbors map:\n");
  for (i=0; i < n; i++) {
    for (j=0; j < m; j++) {
      if (rank == myid) { myi = i; myj = j; }
      B[i][j] = rank++;
      //printf ("%3d ", W[i][j]);
    }
    //printf ("\n");
  }
  //printf("%d: my i,j in neighbor map: %d,%d\n", myid, myi, myj);

  // compute pressures
  // printf("%d: My pressures:\n", myid);
  double pressure = -1;
  for (i=0; i < bi; i++) {
    I = myi * bi + i;
    for (j=0; j < bj; j++) {
      J = myj * bj + j;
      if (I==0 || I==N-1 || J==0 || J==M-1) P[i][j] = 0;
      else P[i][j] = (double)(I+J) * (double)(I*J);
      //printf ("L(%d,%d) G(%d,%d): %.2f\t",i,j,I,J, P[i][j]);
      if (I == tpi && J == tpj) pressure = P[i][j];
    }
    //printf ("\n");
  }

  // average pressure
  int neighbor;
  double center, left, top, right, bottom;
  double average = -1;
  for (i=0; i < bi; i++) {
    I = myi * bi + i;
    for (j=0; j < bj; j++) {
      J = myj * bj + j;
      if ( I==0 || I==N-1 || J==0 || J==M-1 ) continue;
      center = P[i][j];
      // top cell
      if (i==0) {
        neighbor = B[myi-1][myj];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&top, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //printf("%2d: send to %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I-1,J,top);
      } else {
        top = P[i-1][j];
      }
      // bottom cell
      if (i==bi-1) {
        neighbor = B[myi+1][myj];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&bottom, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //printf("%2d: send to %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I+1,J,bottom);
      } else {
        bottom = P[i+1][j];
      }
      // left cell
      if (j==0) {
        neighbor = B[myi][myj-1];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&left, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //printf("%2d: send to %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I,J-1,left);
      } else {
        left = P[i][j-1];
      }
      // right cell
      if (j==bj-1) {
        neighbor = B[myi][myj+1];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&right, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //printf("%2d: send to %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I,J+1,right);
      } else {
        right = P[i][j+1];
      }
      A[i][j] = ( center + left + top + right + bottom ) / 5;
      //printf ("L(%d,%d) G(%d,%d): %.2f\t",i,j,I,J, A[i][j]);
      if (I==tai && J==taj) average = A[i][j];
    }
    //printf ("\n");
  }

  // cleanup memory
  for (i=0; i < bi; i++) { free(P[i]); free(A[i]); }
  free(P);
  free(A);
  for (i=0; i < n; i++) { free(B[i]); }
  free(B);

  // report result
  //printf("Pressure at (16,18): %.2f\n", P[16][18]);
  //printf("Avg at (7,9): %.2f\n", A[7][9]);
  if (pressure > -1)
    printf("Pressure at (%2d,%2d): %.2f computed by processor %d\n", tpi, tpj, pressure, myid);
  if (average > -1)
    printf("Average at (%2d,%2d): %.2f computed by processor %d\n", tai, taj, average, myid);
  MPI_Barrier(MPI_COMM_WORLD);
  if (myid==0) printf("Time elapsed: %.2f seconds.\n", MPI_Wtime()-t);
  MPI_Finalize();
  return 0;
}
/* *************************** * * Main computational kernel * * *************************** */ int correlationKernel(int rank, int size, double* dataMatrixX, double* dataMatrixY, int columns, int rows, char *out_filename, int distance_flag) { int local_check = 0, global_check = 0; int i = 0, j, taskNo; int err, count = 0; unsigned long long fair_chunk = 0, coeff_count = 0; unsigned int init_and_cleanup_loop_iter=0; unsigned long long cor_cur_size = 0; double start_time, end_time; // Variables needed by the Indexed Datatype MPI_Datatype coeff_index_dt; MPI_File fh; int *blocklens, *indices; MPI_Status stat; MPI_Comm comm = MPI_COMM_WORLD; // Master processor keeps track of tasks if (rank == 0) { // Make sure everything will work fine even if there are // less genes than available workers (there are size-1 workers // master does not count) if ( (size-1) > rows ) init_and_cleanup_loop_iter = rows+1; else init_and_cleanup_loop_iter = size; // Start timer start_time = MPI_Wtime(); // Send out initial tasks (remember you have size-1 workers, master does not count) for (i=1; i<init_and_cleanup_loop_iter; i++) { taskNo = i-1; err = MPI_Send(&taskNo, 1, MPI_INT, i, 0, comm); } // Terminate any processes that were not working due to the fact // that the number of rows where less than the actual available workers for(i=init_and_cleanup_loop_iter; i < size; i++) { PROF(rank, "\nPROF_idle : Worker %d terminated due to insufficient work load", i); err = -1; err = MPI_Send(&err, 1, MPI_INT, i, 0, comm); } // Wait for workers to finish their work assignment and ask for more for (i=init_and_cleanup_loop_iter-1; i<rows; i++) { err = MPI_Recv(&taskNo, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, &stat); // Check taskNo to make sure everything is ok. Negative means there is problem // thus terminate gracefully all remaining working workers if ( taskNo < 0 ) { // Reduce by one because one worker is already terminated init_and_cleanup_loop_iter--; // Break and cleanup break; } // The sending processor is ready to work: // It's ID is in stat.MPI_SOURCE // Send it the current task (i) err = MPI_Send(&i, 1, MPI_INT, stat.MPI_SOURCE, 0, comm); } // Clean up processors for (i=1; i<init_and_cleanup_loop_iter; i++) { // All tasks complete - shutdown workers err = MPI_Recv(&taskNo, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, &stat); // If process failed then it will not be waiting to receive anything // We have to ignore the send because it will deadlock if ( taskNo < 0 ) continue; err = -1; err = MPI_Send(&err, 1, MPI_INT, stat.MPI_SOURCE, 0, comm); } // Master is *always* OK local_check = 0; MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); // Check failed, abort if ( global_check != 0 ) { return -1; } // Stop timer end_time = MPI_Wtime(); PROF(rank, "\nPROF_comp (workers=%d) : Time taken by correlation coefficients computations : %g\n", size-1, end_time - start_time); // Start timer start_time = MPI_Wtime(); // Master process must call MPI_File_set_view as well, it's a collective call // Open the file handler MPI_File_open(comm, out_filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); // Create the file view MPI_File_set_view(fh, 0, MPI_DOUBLE, MPI_DOUBLE, "native", MPI_INFO_NULL); // Write data to disk MPI_File_write_all(fh, &cor[0], 0, MPI_DOUBLE, &stat); // Stop timer end_time = MPI_Wtime(); PROF(rank, "\nPROF_write (workers=%d) : Time taken for global write-file : %g\n", size-1, end_time - start_time); } else { // Compute how many workers will share the work load // Two scenarios 
// (1) more OR equal number of workers and rows exist // (2) more rows than workers if ( (size-1) > rows ) { // For this scenario each worker will get exactly one work assignment. // There is not going to be any other work, so it only computes "rows" number // of coefficients fair_chunk = rows; cor_cur_size = fair_chunk; } else { // For this scenario we are going to allocate space equal to a fair // distribution of work assignments *plus* an extra amount of space to // cover any load imbalance. This amount is expressed as a percentage // of the fair work distribution (see on top, 20% for now) // Plus 1 to round it up or just add some extra space, both are fine fair_chunk = (rows / (size-1)) + 1; DEBUG("fair_chunk %llu \n", fair_chunk); // We can use "j" as a temporary variable. // Plus 1 to avoid getting 0 from the multiplication. j = (fair_chunk * MEM_PERC) + 1; cor_cur_size = (fair_chunk + j) * rows; DEBUG("cor_cur_size %lld \n", cor_cur_size); } // Allocate memory DEBUG("cor_cur_size %lld \n", cor_cur_size); long long double_size = sizeof(double); DEBUG("malloc size %lld \n", (double_size * cor_cur_size)); cor = (double *)malloc(double_size * cor_cur_size); blocklens = (int *)malloc(sizeof(int) * rows); indices = (int *)malloc(sizeof(int) * rows); mean_value_vectorX = (double *)malloc(sizeof(double) * rows); Sxx_vector = (double *)malloc(sizeof(double) * rows); mean_value_vectorY = (double *)malloc(sizeof(double) * rows); Syy_vector = (double *)malloc(sizeof(double) * rows); // Check that all memory is successfully allocated if ( ( cor == NULL ) || ( blocklens == NULL ) || ( indices == NULL ) || ( mean_value_vectorX == NULL ) || ( Sxx_vector == NULL ) || ( mean_value_vectorY == NULL ) || ( Syy_vector == NULL ) ) { ERR("**ERROR** : Memory allocation failed on worker process %d. Aborting.\n", rank); // Free allocated memory free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector); // Let the master process know it's aborting in order to terminate // the rest of the working workers // We have to receive a work assignment first and then terminate // otherwise the master will deadlock trying to give work to this worker err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat); taskNo = -1; err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm); // This worker failed local_check = 1; MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); return -1; } // Compute necessary parameters for Pearson method // (this will transform the values of the input array to more meaningful data // and save us from a lot of redundant computations) compute_parameters(dataMatrixX, dataMatrixY, rows, columns); // Main loop for workers. They get work from the master, compute coefficients, // save them to their *local* vector and ask for more work for(;;) { // Get work err = 0; err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat); // If the received task is -1, the function is terminated if ( taskNo == -1 ) break; // Check if there is enough memory to store the new coefficients; if not, reallocate // the current memory and expand it by MEM_PERC of the approximated size if ( cor_cur_size < (coeff_count + rows) ) { PROF(0, "\n**WARNING** : Worker process %3d ran out of memory and reallocates. Potential work imbalance\n", rank); DEBUG("\n**WARNING** : Worker process %3d ran out of memory and reallocates. Potential work imbalance\n", rank); // Use j as a temporary again. Add two (or any other value) to avoid 0.
// (two is just a random value, you can put any value really...) j = (fair_chunk * MEM_PERC) + 2; cor_cur_size += (j * rows); // Reallocate and check cor = (double *)realloc(cor, sizeof(double) * cor_cur_size); if ( cor == NULL ) { ERR("**ERROR** : Memory re-allocation failed on worker process %d. Aborting.\n", rank); // Let the master process know it's aborting in order to terminate // the rest of the working workers taskNo = -1; err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm); // This worker failed local_check = 1; MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); // Free all allocated memory free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector); return -1; } } // Compute the correlation coefficients if(dataMatrixY != NULL) { for (j=0; j < rows; j++) { cor[coeff_count] = pearson_XY(dataMatrixX, dataMatrixY, j, taskNo, columns); coeff_count++; } } else { for (j=0; j < rows; j++) { // Set main diagonal to 1 if ( j == taskNo ) { cor[coeff_count] = 1.0; coeff_count++; continue; } cor[coeff_count] = pearson(dataMatrixX, taskNo, j, columns); coeff_count++; } } // The value of blocklens[] represents the number of coefficients on each // row of the correlation array blocklens[count] = rows; // The value of indices[] represents the offset of each row in the data file indices[count] = (taskNo * rows); count++; // Give the master the taskID err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm); } // There are two possibilities // (a) everything went well and all workers finished ok // (b) some processes finished ok but one or more of the remaining working workers failed // To make sure all is well, an all-reduce will be performed to sync all workers and guarantee success // before moving on to write the output file // This worker is OK local_check = 0; MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); // Check failed if ( global_check != 0 ) { // Free all allocated memory free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector); return -1; } PROF(0, "\nPROF_stats (thread %3d) : Fair chunk of work : %llu \t\t Allocated : %llu \t\t Computed : %llu\n", rank, fair_chunk, cor_cur_size, coeff_count); // If the distance_flag is set, then transform all correlation coefficients to distances if ( distance_flag == 1 ) { for(j=0; j < coeff_count; j++) { cor[j] = 1 - cor[j]; } } // Create and commit the Indexed datatype *ONLY* if there are data available if ( coeff_count != 0 ) { MPI_Type_indexed(count, blocklens, indices, MPI_DOUBLE, &coeff_index_dt); MPI_Type_commit(&coeff_index_dt); } // Open the file handler MPI_File_open(comm, out_filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); // Create the file view if ( coeff_count != 0 ) { MPI_File_set_view(fh, 0, MPI_DOUBLE, coeff_index_dt, "native", MPI_INFO_NULL); } else { MPI_File_set_view(fh, 0, MPI_DOUBLE, MPI_DOUBLE, "native", MPI_INFO_NULL); } // Write data to disk // TODO coeff_count cannot be greater than max int (for use in the MPI_File_write_all call). // A better fix should be possible, for now throw an error. DEBUG("\ncoeff_count is %lld\n", coeff_count); DEBUG("\nINT_MAX is %d\n", INT_MAX); if(coeff_count>INT_MAX) { ERR("**ERROR** : Could not run as the chunks of data are too large. 
Try running again with more MPI processes.\n"); // Free allocated memory free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector); // Let the master process know it's aborting in order to terminate // the rest of the working workers // We have to receive a work assignment first and then terminate // otherwise the master will deadlock trying to give work to this worker err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat); taskNo = -1; err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm); // This worker failed local_check = 1; MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); return -1; } DEBUG("\nWriting %llu to disk\n", coeff_count); MPI_File_write_all(fh, &cor[0], coeff_count, MPI_DOUBLE, &stat); if (coeff_count != 0 ) MPI_Type_free(&coeff_index_dt); // Free all allocated memory free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector); } DEBUG("\nAbout to write to disk %d\n", rank); MPI_File_sync( fh ) ; // Causes all previous writes to be transferred to the storage device DEBUG("\nWritten to disk %d\n",rank); // MPI_Barrier( MPI_COMM_WORLD ) ; // Blocks until all processes in the communicator have reached this routine. DEBUG("\nAfter barrier %d\n", rank); // Close file handler MPI_File_close(&fh); DEBUG("\nAfter file closed\n"); // MPI_Barrier( MPI_COMM_WORLD ) ; // Blocks until all processes in the communicator have reached this routine. DEBUG("\nAbout to return from kernel\n"); return 0; }
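/* A hedged, self-contained sketch of the MPI-IO pattern used by the workers
 * above: an indexed datatype maps locally computed rows onto their offsets in
 * the shared output file, and MPI_File_set_view plus MPI_File_write_all then
 * place the non-contiguous rows collectively. The file name, the two-row
 * layout and the function name are illustrative assumptions, not taken from
 * the original program. Assumes <mpi.h> is included. */
static void write_two_rows_example(MPI_Comm comm, double *rows_data, int row_len)
{
    int          blocklens[2] = { row_len, row_len };   /* two full rows        */
    int          indices[2]   = { 0, 3 * row_len };     /* e.g. rows 0 and 3    */
    MPI_Datatype filetype;
    MPI_File     fh;
    MPI_Status   stat;

    MPI_Type_indexed(2, blocklens, indices, MPI_DOUBLE, &filetype);
    MPI_Type_commit(&filetype);

    MPI_File_open(comm, "example_output.bin",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
    /* Offsets in the filetype are counted in units of the etype (MPI_DOUBLE). */
    MPI_File_set_view(fh, 0, MPI_DOUBLE, filetype, "native", MPI_INFO_NULL);
    MPI_File_write_all(fh, rows_data, 2 * row_len, MPI_DOUBLE, &stat);
    MPI_File_close(&fh);
    MPI_Type_free(&filetype);
}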
int main( int argc, char *argv[] ) { int numprocs, myid, server, workerid, ranks[1], request, i, iter, ix, iy, done; long rands[CHUNKSIZE], max, in, out, totalin, totalout; double x, y, Pi, error, epsilon; MPI_Comm world, workers; MPI_Group world_group, worker_group; MPI_Status status; MPI_Init( &argc, &argv ); world = MPI_COMM_WORLD; MPI_Comm_size( world, &numprocs ); MPI_Comm_rank( world, &myid ); server = numprocs-1; // Last process is a random server /*** * Now Master should read epsilon from command line * and distribute it to all processes. */ if (myid == 0) // Read epsilon from command line sscanf( argv[1], "%lf", &epsilon ); MPI_Bcast( &epsilon, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD ); /*** * Create new process group called world_group containing all * processes and its communicator called world * and a group called worker_group containing all processes * except the last one (called here server) * and its communicator called workers. */ MPI_Comm_group( world, &world_group ); ranks[0] = server; MPI_Group_excl( world_group, 1, ranks, &worker_group ); MPI_Comm_create( world, worker_group, &workers ); MPI_Group_free( &worker_group ); MPE_XGraph graph; MPE_Open_graphics(&graph,MPI_COMM_WORLD,(char*)0, -1,-1,WINDOW_SIZE,WINDOW_SIZE,MPE_GRAPH_INDEPENDENT); /*** * Server part * * Server should loop until request code is 0, in each iteration: * - receiving request code from any slave * - generating a vector of CHUNKSIZE randoms <= INT_MAX * - sending vector back to slave */ if (myid == server) { // I am the random generator server do { MPI_Recv( &request, 1, MPI_INT, MPI_ANY_SOURCE, REQUEST, world, &status ); if (request) { for (i = 0; i < CHUNKSIZE; ) { rands[i] = rand(); if ( rands[i] <= INT_MAX ) i++; } MPI_Send( rands, CHUNKSIZE, MPI_LONG, status.MPI_SOURCE, REPLY, world ); } } while( request > 0 ); } /*** * Workers (including Master) part * * Worker should send initial request to server. * Later, in a loop worker should: * - receive vector of randoms * - compute x,y point inside unit square * - check (and count result) if point is inside/outside * unit circle * - sum both counts over all workers * - calculate pi and its error (from "exact" value) * - test if error is within epsilon limit * - test continuation condition (error and max. points limit) * - print pi by master only * - send a request to server (all if more or master only if finish) * Before finishing workers should free their communicator. */ else { // I am a worker process request = 1; done = 0; in = out = 0; max = INT_MAX; // max int, for normalization MPI_Send( &request, 1, MPI_INT, server, REQUEST, world ); MPI_Comm_rank( workers, &workerid ); iter = 0; while (!done) { iter++; request = 1; MPI_Recv( rands, CHUNKSIZE, MPI_LONG, server, REPLY, world, &status ); for (i = 0; i < CHUNKSIZE - 1; ) { x = (((double) rands[i++])/max) * 2 - 1; y = (((double) rands[i++])/max) * 2 - 1; if ( x*x + y*y < 1.0 ) { MPE_Draw_point(graph,(int)(WINDOW_SIZE/2+x*WINDOW_SIZE/2),(int)(WINDOW_SIZE+y*WINDOW_SIZE/2),MPE_RED); in++; } else out++; } MPI_Allreduce( &in, &totalin, 1, MPI_LONG, MPI_SUM, workers ); MPI_Allreduce( &out, &totalout, 1, MPI_LONG, MPI_SUM, workers ); Pi = ( 4.0 * totalin ) / ( totalin + totalout ); error = fabs( Pi - PI ); done = ( error < epsilon || (totalin + totalout) > THROW_MAX ); request = (done) ? 
0 : 1; MPE_Update(graph); if (myid == 0) { printf( "\rpi = %23.20f", Pi ); MPI_Send( &request, 1, MPI_INT, server, REQUEST, world ); } else { if (request) MPI_Send( &request, 1, MPI_INT, server, REQUEST, world ); } } MPI_Comm_free( &workers ); } /*** * Master should print final point counts. */ if (myid == 0) { printf( "\npoints: %ld\nin: %ld, out: %ld, <ret> to exit\n", totalin+totalout, totalin, totalout ); getchar(); } MPE_Close_graphics(graph); MPI_Finalize(); return 0; }
int main(int argc, char** argv) { int my_rank, p; int i, dest; mpz_t currentPrime; unsigned long int product; sscanf(argv[1], "%lu", &product); int secondFactor = 0; int bcastStatus; int equals; /** GMP library variables **/ mpz_t nextPrimeNumber; mpz_t testFactor; mpz_init(nextPrimeNumber); mpz_init_set_str (nextPrimeNumber, argv[1], 10); mpz_init(testFactor); mpz_init_set_ui(currentPrime, 2); mpz_nextprime(nextPrimeNumber, nextPrimeNumber); mpz_t testProduct; mpz_init(testProduct); /** MPI Initialization **/ MPI_Request finalValue; MPI_File out; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &p); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Status status; /** Get ready to receive a factor if another process finds one (secondFactor is an int, so the matching type is MPI_INT) */ MPI_Irecv(&secondFactor, 1, MPI_INT, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &finalValue); /** Prepare initial offset for each process **/ for (i=0 ; i < my_rank ; i++) { mpz_nextprime(currentPrime, currentPrime); } /** Start Timing **/ double start = MPI_Wtime(), diff; while (!secondFactor) { /** Check if another process has found the factors **/ MPI_Test (&finalValue, &bcastStatus, &status); if(bcastStatus) { /** Somebody else has found the factors, we are done **/ MPI_Wait(&finalValue, &status); break; } /** Skip P primes before checking again **/ for (i=0 ; i < p ; i++) { mpz_nextprime(currentPrime, currentPrime); } /** Brute force check if the current working prime is a factor of the input number **/ for (mpz_set_ui(testFactor , 2) ; mpz_get_ui(testFactor) <= mpz_get_ui(currentPrime); mpz_nextprime(testFactor, testFactor)) { /** Check if another process has found the factors **/ MPI_Test (&finalValue, &bcastStatus, &status); if(bcastStatus) { MPI_Wait(&finalValue, &status); break; } mpz_mul_ui(testProduct, currentPrime, mpz_get_ui(testFactor)); equals = mpz_cmp_ui(testProduct, product); if (equals == 0){ /** We've found the factor; find the second number and send it to the other processes **/ secondFactor = mpz_get_ui(testFactor); printf("done by process %d, factors are %lu and %d \n", my_rank, mpz_get_ui(currentPrime), secondFactor); fflush(stdout); for (dest = 0 ; dest < p ; dest++) { if (dest != my_rank) { MPI_Send(&secondFactor, 1, MPI_INT, dest, 0, MPI_COMM_WORLD); } } } } } diff = MPI_Wtime() - start; /** End Timing **/ /** Prepare file contents **/ char fileName[200], fileContents[200]; sprintf(fileName, "time_%lu", product); sprintf(fileContents, "%d\t%f\n", my_rank, diff); /** Write File **/ MPI_File_open( MPI_COMM_WORLD, fileName, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &out ); MPI_File_seek(out, my_rank*strlen ( fileContents ) , MPI_SEEK_SET); MPI_File_write_all(out , &fileContents, strlen ( fileContents ), MPI_CHAR, &status ); MPI_File_close(&out); /** Fin **/ MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return(0); }
static void _coupling_mpi_set_synchronize_roots(ple_coupling_mpi_set_t *s, int sync_flag, double time_step, _mpi_double_int_t glob_vals[]) { int i; MPI_Status status; int app_rank; int sync_root = -1; MPI_Comm_rank(s->app_comm, &app_rank); /* Return immediately if not application root */ if (app_rank != 0 || (s->app_status[s->app_id] & PLE_COUPLING_NO_SYNC)) return; /* First, sync data to root */ for (i = 0; i < s->n_apps; i++) { if (! (s->app_status[i] & PLE_COUPLING_NO_SYNC)) { sync_root = i; break; } } if (sync_root == s->app_id) { for (i = 0; i < s->n_apps; i++) { if (s->app_status[i] & PLE_COUPLING_NO_SYNC) { /* Keep previous values */ glob_vals[i].i = s->app_status[i]; glob_vals[i].d = s->app_timestep[i]; } else { if (i != sync_root) MPI_Recv(&(glob_vals[i]), 1, MPI_DOUBLE_INT, s->app_info[i*4], _coupling_tag, s->base_comm, &status); else { glob_vals[i].i = sync_flag; glob_vals[i].d = time_step; } } } } else if (! (s->app_status[s->app_id] & PLE_COUPLING_NO_SYNC)) { _mpi_double_int_t send_vals; send_vals.i = sync_flag; send_vals.d = time_step; MPI_Send(&send_vals, 1, MPI_DOUBLE_INT, s->app_info[sync_root], _coupling_tag, s->base_comm); } /* Now, root sends data to all */ if (sync_root == s->app_id) { for (i = 0; i < s->n_apps; i++) { if (i != sync_root && ! (s->app_status[i] & PLE_COUPLING_NO_SYNC)) MPI_Send(glob_vals, s->n_apps, MPI_DOUBLE_INT, s->app_info[i*4], _coupling_tag, s->base_comm); } } else MPI_Recv(glob_vals, s->n_apps, MPI_DOUBLE_INT, s->app_info[sync_root], _coupling_tag, s->base_comm, &status); }
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out) { KSP_AGMRES *agmres = (KSP_AGMRES*) ksp->data; MPI_Comm comm; PetscScalar *Qloc = agmres->Qloc; PetscScalar *sgn = agmres->sgn; PetscScalar *tloc = agmres->tloc; PetscMPIInt rank = agmres->rank; PetscMPIInt First = agmres->First, Last = agmres->Last; PetscMPIInt Iright = agmres->Iright, Ileft = agmres->Ileft; PetscScalar *y, *zloc; PetscErrorCode ierr; PetscInt nloc,d, len, i, j; PetscBLASInt bnvec,pas,blen; PetscInt dpt; PetscReal c, s, rho, zp, zq, yd, tt; MPI_Status status; PetscFunctionBegin; ierr = PetscBLASIntCast(nvec,&bnvec);CHKERRQ(ierr); ierr = PetscObjectGetComm((PetscObject)ksp,&comm);CHKERRQ(ierr); pas = 1; ierr = VecGetLocalSize(VEC_V(0), &nloc);CHKERRQ(ierr); ierr = PetscMalloc1(nvec, &y);CHKERRQ(ierr); ierr = PetscMemcpy(y, In, nvec*sizeof(PetscScalar));CHKERRQ(ierr); ierr = VecGetArray(Out, &zloc);CHKERRQ(ierr); if (rank == Last) { for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i]; } for (i = 0; i < nloc; i++) zloc[i] = 0.0; if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas)); else { for (d = nvec - 1; d >= 0; d--) { if (rank == First) { ierr = MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);CHKERRQ(ierr); } else { for (j = nvec - 1; j >= d + 1; j--) { i = j - d; ierr = KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);CHKERRQ(ierr); zp = zloc[i-1]; zq = zloc[i]; zloc[i-1] = c * zp + s * zq; zloc[i] = -s * zp + c * zq; } ierr = KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);CHKERRQ(ierr); if (rank == Last) { zp = y[d]; zq = zloc[0]; y[d] = c * zp + s * zq; zloc[0] = -s * zp + c * zq; ierr = MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);CHKERRQ(ierr); } else { ierr = MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);CHKERRQ(ierr); zp = yd; zq = zloc[0]; yd = c * zp + s * zq; zloc[0] = -s * zp + c * zq; ierr = MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);CHKERRQ(ierr); } } } } for (j = nvec - 1; j >= 0; j--) { dpt = j * nloc + j; if (tloc[j] != 0.0) { len = nloc - j; ierr = PetscBLASIntCast(len,&blen);CHKERRQ(ierr); rho = Qloc[dpt]; Qloc[dpt] = 1.0; tt = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas)); PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas)); Qloc[dpt] = rho; } } ierr = VecRestoreArray(Out, &zloc);CHKERRQ(ierr); ierr = PetscFree(y);CHKERRQ(ierr); PetscFunctionReturn(0); }
void Foam::mpiPstreamImpl::reduce(scalar& Value, const sumOp<scalar>& bop) { if (!Pstream::parRun()) { return; } if (Pstream::nProcs() <= Pstream::nProcsSimpleSum) { if (Pstream::master()) { for ( int slave=Pstream::firstSlave(); slave<=Pstream::lastSlave(); slave++ ) { scalar value; if ( MPI_Recv ( &value, 1, MPI_SCALAR, Pstream::procID(slave), Pstream::msgType(), MPI_COMM_WORLD, MPI_STATUS_IGNORE ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Recv failed" << Foam::abort(FatalError); } Value = bop(Value, value); } } else { if ( MPI_Send ( &Value, 1, MPI_SCALAR, Pstream::procID(Pstream::masterNo()), Pstream::msgType(), MPI_COMM_WORLD ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Send failed" << Foam::abort(FatalError); } } if (Pstream::master()) { for ( int slave=Pstream::firstSlave(); slave<=Pstream::lastSlave(); slave++ ) { if ( MPI_Send ( &Value, 1, MPI_SCALAR, Pstream::procID(slave), Pstream::msgType(), MPI_COMM_WORLD ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Send failed" << Foam::abort(FatalError); } } } else { if ( MPI_Recv ( &Value, 1, MPI_SCALAR, Pstream::procID(Pstream::masterNo()), Pstream::msgType(), MPI_COMM_WORLD, MPI_STATUS_IGNORE ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Recv failed" << Foam::abort(FatalError); } } } else { scalar sum; MPI_Allreduce(&Value, &sum, 1, MPI_SCALAR, MPI_SUM, MPI_COMM_WORLD); Value = sum; /* int myProcNo = Pstream::myProcNo(); int nProcs = Pstream::nProcs(); // // receive from children // int level = 1; int thisLevelOffset = 2; int childLevelOffset = thisLevelOffset/2; int childProcId = 0; while ( (childLevelOffset < nProcs) && (myProcNo % thisLevelOffset) == 0 ) { childProcId = myProcNo + childLevelOffset; scalar value; if (childProcId < nProcs) { if ( MPI_Recv ( &value, 1, MPI_SCALAR, Pstream::procID(childProcId), Pstream::msgType(), MPI_COMM_WORLD, MPI_STATUS_IGNORE ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Recv failed" << Foam::abort(FatalError); } Value = bop(Value, value); } level++; thisLevelOffset <<= 1; childLevelOffset = thisLevelOffset/2; } // // send and receive from parent // if (!Pstream::master()) { int parentId = myProcNo - (myProcNo % thisLevelOffset); if ( MPI_Send ( &Value, 1, MPI_SCALAR, Pstream::procID(parentId), Pstream::msgType(), MPI_COMM_WORLD ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Send failed" << Foam::abort(FatalError); } if ( MPI_Recv ( &Value, 1, MPI_SCALAR, Pstream::procID(parentId), Pstream::msgType(), MPI_COMM_WORLD, MPI_STATUS_IGNORE ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Recv failed" << Foam::abort(FatalError); } } // // distribute to my children // level--; thisLevelOffset >>= 1; childLevelOffset = thisLevelOffset/2; while (level > 0) { childProcId = myProcNo + childLevelOffset; if (childProcId < nProcs) { if ( MPI_Send ( &Value, 1, MPI_SCALAR, Pstream::procID(childProcId), Pstream::msgType(), MPI_COMM_WORLD ) ) { FatalErrorIn ( "reduce(scalar& Value, const sumOp<scalar>& sumOp)" ) << "MPI_Send failed" << Foam::abort(FatalError); } } level--; thisLevelOffset >>= 1; childLevelOffset = thisLevelOffset/2; } */ } }
void terminate_children (int nproc) { // MPI_Abort(MPI_COMM_WORLD, 0); for (int i = 1; i < nproc; i++) MPI_Send(NULL, 0, MPI_DOUBLE, i, exit_tag, MPI_COMM_WORLD); }
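/* A hedged sketch of the worker side that would match terminate_children()
 * above: the worker probes the next message from rank 0 and treats a message
 * carrying exit_tag as the signal to stop. The name work_tag and the
 * double-valued work item are illustrative assumptions; only exit_tag comes
 * from the code above. Assumes <mpi.h> is included. */
static void worker_loop_sketch(int exit_tag, int work_tag)
{
    MPI_Status status;
    double work_item;
    for (;;) {
        /* Peek at the next message to learn its tag without consuming it. */
        MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        if (status.MPI_TAG == exit_tag) {
            /* Consume the zero-length termination message and stop. */
            MPI_Recv(NULL, 0, MPI_DOUBLE, 0, exit_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            break;
        }
        MPI_Recv(&work_item, 1, MPI_DOUBLE, 0, work_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* ... process work_item here ... */
    }
}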
int main(int argc, char *argv[] ) { double time1, time2; time1 = MPI_Wtime(); int rank, processors; int j; // number of iterations int k; // number of iterations to perform before creating a checkpoint int l; // number of random samples per grid point int checkpoint_resume = 0; // 1 = resume from last checkpoint int c; // used to hold a character int i=0, row = 0, col = 0, pln = 0; // array iterators char ***local_array; char **local_array_2nd; char *local_array_pointer; char ***local_array_copy; char **local_array_copy_2nd; char *local_array_copy_pointer; char ***temp, *temp_pointer; int file_open_error; int command_line_incomplete = 0; int grid_size[3] = {0,0,0}; int proc_size[3] = {0,0,0}; int local_size[3] = {0,0,0}; int remainder_size[3] = {0,0,0}; int coords[3] = {0,0,0}; int start_indices[3] = {0,0,0}; int periods[3] = {0,0,0}; int mem_size[3] = {0,0,0}; MPI_Status status; MPI_Datatype filetype, memtype; MPI_File fh; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &processors); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Interpret the command line arguments -------------------------------- if (rank == 0) { if (argc < 6 || argc > 8) { fputs("usage: x y z j k l r\n", stderr); fputs("where: x,y,z = x, y and z dimensions\n", stderr); fputs(" j = how many times the game of life is played\n", stderr); fputs(" k = checkpoint every k iterations\n", stderr); fputs(" l = number of random samples per grid point\n", stderr); fputs(" r = resume from the last checkpoint\n", stderr); fputs(INITIAL, stderr); fputs(" must be present.\n", stderr); fputs(CHECKPOINT, stderr); fputs(" must be present if resuming from the last checkpoint.\n", stderr); exit(EXIT_FAILURE); } } j = (int) strtol(argv[4], NULL, 10); k = (int) strtol(argv[5], NULL, 10); l = (int) strtol(argv[6], NULL, 10); if ( argc == 7 ) if ( argv[6][0] == 'r' ) checkpoint_resume = 1; if (rank == 0) printf("%d iterations \ncheckpoint every %d iterations \n%d samples per grid point \ncheckpoint resume = %d\n", j,k,l,checkpoint_resume); grid_size[0] = (int) strtol(argv[1], NULL, 10); grid_size[1] = (int) strtol(argv[2], NULL, 10); grid_size[2] = (int) strtol(argv[3], NULL, 10); if (rank==0) printf("grid_size: %d, %d, %d\n", grid_size[0], grid_size[1], grid_size[2]); MPI_Dims_create(processors, 3, proc_size); if (rank==0) printf("proc_size: %d, %d, %d\n", proc_size[0], proc_size[1], proc_size[2]); local_size[0] = grid_size[0] / proc_size[0]; local_size[1] = grid_size[1] / proc_size[1]; local_size[2] = grid_size[2] / proc_size[2]; if (rank==0) printf("local_size: %d, %d, %d\n", local_size[0], local_size[1], local_size[2]); remainder_size[0] = grid_size[0] % proc_size[0]; remainder_size[1] = grid_size[1] % proc_size[1]; remainder_size[2] = grid_size[2] % proc_size[2]; if (rank==0) printf("remainder_size: %d, %d, %d\n", remainder_size[0], remainder_size[1], remainder_size[2]); if (remainder_size[0] != 0 || remainder_size[1] != 0 || remainder_size[2] != 0) { fputs("remainder size != 0, check your dimensions", stderr); MPI_Finalize(); exit(EXIT_FAILURE); } MPI_Comm comm; MPI_Cart_create(MPI_COMM_WORLD, 3, proc_size, periods, 0, &comm); MPI_Comm_rank(comm, &rank); MPI_Cart_coords(comm, rank, 3, coords); start_indices[0] = coords[0] * local_size[0]; start_indices[1] = coords[1] * local_size[1]; start_indices[2] = coords[2] * local_size[2]; /* printf("A coords R%d: (%d, %d, %d) (%d, %d, %d)\n", rank, coords[0], coords[1], coords[2], start_indices[0], start_indices[1], start_indices[2]);*/ fflush(stdout); // create the file type 
--------------------------------------------------- MPI_Type_create_subarray(3, grid_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &filetype); MPI_Type_commit(&filetype); // create a local memory type with ghost rows ----------------------------- mem_size[0] = local_size[0] + 2; mem_size[1] = local_size[1] + 2; mem_size[2] = local_size[2] + 2; start_indices[0] = start_indices[1] = start_indices[2] = 1; MPI_Type_create_subarray(3, mem_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &memtype); MPI_Type_commit(&memtype); // find my neighbors ------------------------------------------------------ int nxminus, nxplus, nyminus, nyplus, nzminus, nzplus, tag = 333, *neighbors; // Neighbors Array: row- col- col+ row+ plane- plane+ neighbors = (int *) malloc(6 * sizeof(int)); for(i=0; i<6; i++) neighbors[i] = rank; MPI_Cart_shift(comm, 0, 1, &nxminus, &nxplus); MPI_Cart_shift(comm, 1, 1, &nyminus, &nyplus); MPI_Cart_shift(comm, 2, 1, &nzminus, &nzplus); // printf(" %d sending south to %d receiving from %d \n",rank,nxplus,nxminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nxplus, tag, &(neighbors[0]), 1, MPI_INT, nxminus, tag, comm, &status); // printf(" %d sending North to %d receiving from %d \n",rank,nxminus,nxplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nxminus, tag, &(neighbors[3]), 1, MPI_INT, nxplus, tag, comm, &status); // printf(" %d sending East to %d receiving from %d \n",rank,nyplus,nyminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nyplus, tag, &neighbors[1], 1, MPI_INT, nyminus, tag, comm, &status); // printf(" %d sending West to %d receiving from %d \n",rank,nyminus,nyplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nyminus, tag, &neighbors[2], 1, MPI_INT, nyplus, tag, comm, &status); // printf(" %d sending backwards to %d receiving from %d \n",rank,nzplus,nzminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nzplus, tag, &(neighbors[4]), 1, MPI_INT, nzminus, tag, comm, &status); // printf(" %d sending forward to %d receiving from %d \n",rank,nzminus,nzplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nzminus, tag, &(neighbors[5]), 1, MPI_INT, nzplus, tag, comm, &status); /* printf("neighboors R%d : (row-) %d (col-) %d (col+) %d (row+) %d (plane-) %d (plane+) %d\n",rank,neighbors[0],neighbors[1],neighbors[2],neighbors[3],neighbors[4],neighbors[5]);*/ fflush(stdout); //init_sprng(1,time(0),SPRNG_DEFAULT); srand((unsigned int)time(NULL)); // Open the initial condition (checkpoint or not) ---------------------- if ( checkpoint_resume ) { file_open_error = MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); } else { file_open_error = MPI_File_open(MPI_COMM_WORLD, INITIAL, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); } if (file_open_error != MPI_SUCCESS) { if (checkpoint_resume) fputs(CHECKPOINT, stderr); else fputs(INITIAL, stderr); fputs(" could not be opened.\n", stderr); exit(EXIT_FAILURE); } // Allocate and Populate the local array ---------------------------------- local_array_copy_pointer = (char *) malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char)); local_array_copy_2nd = (char **) malloc(mem_size[0] * mem_size[1] * sizeof(char*)); local_array_copy = (char ***) malloc(mem_size[0] * sizeof(char*)); for(i = 0; i < mem_size[0] * mem_size[1]; i++) local_array_copy_2nd[i] = &local_array_copy_pointer[i * 
mem_size[2]]; for(i = 0; i < mem_size[0]; i++) local_array_copy[i] = &local_array_copy_2nd[i * mem_size[1]]; local_array_pointer = (char *) malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char)); local_array_2nd = (char **) malloc(mem_size[0] * mem_size[1] * sizeof(char*)); local_array = (char ***) malloc(mem_size[0] * sizeof(char*)); for(i = 0; i < mem_size[0] * mem_size[1]; i++) local_array_2nd[i] = &local_array_pointer[i * mem_size[2]]; for(i = 0; i < mem_size[0]; i++) local_array[i] = &local_array_2nd[i * mem_size[1]]; // if (rank==0) printf("Malloc complete\n"); for(row=0; row<mem_size[0]; row++) { for(col=0; col<mem_size[1]; col++) { for(pln=0; pln<mem_size[2]; pln++) { local_array[row][col][pln] = local_array_copy[row][col][pln] = '0'; } } } // if (rank==0) printf("Setup complete\n"); MPI_File_read_all(fh, local_array_pointer, 1, memtype, &status); if (rank==0) printf("File Read\n"); // if (rank==0) { // for(row=0; row<mem_size[0]; row++) { // for(col=0; col<mem_size[1]; col++) { // for(pln=0; pln<mem_size[2]; pln++) { // printf("%c", local_array[row][col][pln]); // } // printf("\n"); // } // printf("-----------------------\n"); // } // } MPI_File_close(&fh); // Construct the plane data types MPI_Datatype yzplane; MPI_Type_vector(local_size[1], local_size[2], local_size[2]+2, MPI_CHAR, &yzplane); MPI_Type_commit(&yzplane); MPI_Datatype xzplane; MPI_Type_vector(local_size[0], local_size[2], ((local_size[2]+2)*local_size[1])+((local_size[2]+2)*2), MPI_CHAR, &xzplane); MPI_Type_commit(&xzplane); // this type will also copy the corner x columns, can't skip blocks intermittently // since we aren't worrying about the corner data, it's ok MPI_Datatype xyplane; MPI_Type_vector((local_size[0]*local_size[1])+((local_size[0]*2)-2), 1, local_size[2]+2, MPI_CHAR, &xyplane); MPI_Type_commit(&xyplane); MPI_Barrier(comm); // start the iteration loop int iterations; int kCounter = k; for (iterations = 0; iterations < j; iterations++) { // send updated planes // Neighbors Array: // 0 1 2 3 4 5 // row- col- col+ row+ plane- plane+ // Note: corners are not handled // send top yzplane if (rank != neighbors[0]) MPI_Send(&local_array[1][1][1], 1, yzplane, neighbors[0], 0, comm); // recv bottom yzplane if (rank != neighbors[3]) MPI_Recv(&local_array[local_size[0]+1][1][1], 1, yzplane, neighbors[3], 0, comm, &status); // send bottom yzplane if (rank != neighbors[3]) MPI_Send(&local_array[local_size[0]][1][1], 1, yzplane, neighbors[3], 0, comm); // recv top yzplane if (rank != neighbors[0]) MPI_Recv(&local_array[0][1][1], 1, yzplane, neighbors[0], 0, comm, &status); // send left xzplane if (rank != neighbors[1]) MPI_Send(&local_array[1][1][1], 1, xzplane, neighbors[1], 0, comm); // recv right xzplane if (rank != neighbors[2]) MPI_Recv(&local_array[1][local_size[1]+1][1], 1, xzplane, neighbors[2], 0, comm, &status); // send right xzplane if (rank != neighbors[2]) MPI_Send(&local_array[1][local_size[1]][1], 1, xzplane, neighbors[2], 0, comm); // recv left xzplane if (rank != neighbors[1]) MPI_Recv(&local_array[1][0][1], 1, xzplane, neighbors[1], 0, comm, &status); // send front xyplane if (rank != neighbors[4]) MPI_Send(&local_array[1][1][1], 1, xyplane, neighbors[4], 0, comm); // recv back xyplane if (rank != neighbors[5]) MPI_Recv(&local_array[1][1][local_size[2]+1], 1, xyplane, neighbors[5], 0, comm, &status); // send back xyplane if (rank != neighbors[5]) MPI_Send(&local_array[1][1][local_size[2]], 1, xyplane, neighbors[5], 0, comm); // recv front xyplane if (rank != neighbors[4]) 
MPI_Recv(&local_array[1][1][0], 1, xyplane, neighbors[4], 0, comm, &status); // if (rank==0) { // for(row=0; row<mem_size[0]; row++) { // for(col=0; col<mem_size[1]; col++) { // for(pln=0; pln<mem_size[2]; pln++) { // printf("%c", local_array[row][col][pln]); // } // printf("\n"); // } // printf("-----------------------\n"); // } // } // run the game of life // gameOfLife(local_array, local_array_copy, local_size[0], local_size[1], l, rank); // swap the arrays // temp1 = local_array; // local_array = local_array_copy; // local_array_copy = temp1; // // temp2 = local_array_pointer; // local_array_pointer = local_array_copy_pointer; // local_array_copy_pointer = temp2; // check to see if this iteration needs a checkpoint kCounter--; if (kCounter == 0) { kCounter = k; // checkpoint code MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status); MPI_File_close(&fh); if (rank == 0) printf("Checkpoint made: Iteration %d\n", iterations+1); } // end if kCounter == 0 } // end iteration loop iterations--; // all done! repeat the checkpoint process MPI_File_open(MPI_COMM_WORLD, FINAL_RESULTS, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status); MPI_File_close(&fh); if (rank == 0) printf("Final Results made: Iteration %d\n", iterations+1); time2 = MPI_Wtime(); if (rank == 0) printf("Elapsed Seconds: %f\n", time2-time1);fflush(stdout); MPI_Finalize(); return EXIT_SUCCESS; }
/****************************************************************************** * * The main worker node function. * * int thread_id: the thread_id * char *fastq1: FIFO from which bowtie2 can get read1 * char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) * *******************************************************************************/ void herd_worker_node(int thread_id, char *fastq1, char *fastq2) { int cmd_length = 1, max_qname = 0, status, strand; char *cmd, *last_qname = calloc(1, sizeof(char)); MPI_Header *packed_header; MPI_read *packed_read = calloc(1, sizeof(MPI_read)); bam_hdr_t *header; bam1_t *read1 = bam_init1(); bam1_t *read2 = bam_init1(); samFile *fp; #ifdef DEBUG MPI_Status stat; int current_p_size = 100; htsFile *of; bam_hdr_t *debug_header = bam_hdr_init(); bam1_t *debug_read = bam_init1(); global_header = bam_hdr_init(); void *p = calloc(100,1); char *oname = NULL; #else int i = 0; #endif time_t t0, t1; int swapped = 0; assert(last_qname); assert(packed_read); //Which strand should we be aligning to? if(config.directional) { strand = (thread_id-1) % 2; } else { strand = (thread_id-1) % 4; } packed_read->size = 0; packed_read->packed = NULL; //construct the bowtie2 command cmd_length += (int) strlen("bowtie2 -q --reorder") + 1; cmd_length += (int) strlen(config.bowtie2_options) + 1; cmd_length += (int) strlen("--norc -x") + 1; cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3; if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded. #ifdef DEBUG oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); assert(oname); sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id); if(!config.quiet) fprintf(stderr, "Writing output to %s\n", oname); of = sam_open(oname, "wb"); free(oname); #endif cmd = (char *) malloc(sizeof(char) * cmd_length); assert(cmd); if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand if(config.paired) { sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); } else { 
sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); } } else { fprintf(stderr, "Oh shit, got strand %i!\n", strand); return; } //Start the process if(!config.quiet) fprintf(stderr, "Node %i executing: %s\n", thread_id, cmd); fflush(stderr); fp = sam_popen(cmd); header = sam_hdr_read(fp); #ifdef DEBUG sam_hdr_write(of, header); #endif #ifndef DEBUG packed_header = pack_header(header); if(thread_id == 1) { //Send the header MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); if(status != MPI_SUCCESS) { fprintf(stderr, "MPI_Send returned %i\n", status); fflush(stderr); } } #else packed_header = pack_header(header); void *tmp_pointer = malloc(packed_header->size); assert(tmp_pointer); MPI_Request request; MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request); status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat); if(status != MPI_SUCCESS) fprintf(stderr, "We seem to have not been able to send the message to ourselves!\n"); MPI_Wait(&request, &stat); unpack_header(debug_header, tmp_pointer); global_header = debug_header; free(tmp_pointer); #endif t0 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stderr); while(sam_read1(fp, header, read1) >= 0) { #ifdef DEBUG sam_write1(of, global_header, read1); #endif if(strcmp(bam_get_qname(read1), last_qname) == 0) { //Multimapper if(config.paired) { sam_read1(fp, header, read2); #ifdef DEBUG sam_write1(of, global_header, read2); #endif } continue; } else { if(read1->core.l_qname > max_qname) { max_qname = read1->core.l_qname + 10; last_qname = realloc(last_qname, sizeof(char) * max_qname); assert(last_qname); } strcpy(last_qname, bam_get_qname(read1)); } //Are paired-end reads in the wrong order? 
swapped = 0; if(config.paired) { if(read1->core.flag & BAM_FREAD2) { swapped = 1; sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } } //Send the read packed_read = pack_read(read1, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); #endif //Deal with paired-end reads if(config.paired && !swapped) { sam_read1(fp, header, read2); packed_read = pack_read(read2, packed_read); #ifndef DEBUG MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); #else sam_write1(of, global_header, read2); if(packed_read->size > current_p_size) { p = realloc(p, packed_read->size); assert(p); } MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); MPI_Wait(&request, &stat); debug_read = unpack_read(debug_read, p); #endif } #ifndef DEBUG i++; #endif } t1 = time(NULL); if(!config.quiet) fprintf(stderr, "Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stderr); //Notify the master node packed_read->size = 0; #ifndef DEBUG void *A = malloc(1); assert(A); MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); free(A); #endif //Close things up bam_hdr_destroy(header); bam_destroy1(read1); bam_destroy1(read2); free(cmd); if(packed_read->packed != NULL) free(packed_read->packed); free(packed_read); if(packed_header->packed != NULL) free(packed_header->packed); free(packed_header); free(last_qname); sam_pclose(fp); //Remove the FIFO(s) unlink(fastq1); if(config.paired) unlink(fastq2); #ifdef DEBUG sam_close(of); bam_hdr_destroy(debug_header); bam_destroy1(debug_read); free(p); #endif if(!config.quiet) fprintf(stderr, "Exiting worker node %i\n", thread_id); fflush(stderr); };
/*! \brief * * <pre> * Purpose * ======= * * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a * graph partitioning algorithm to the symmetrized graph A+A'. The * multilevel graph partitioning algorithm used is the * ParMETIS_V3_NodeND routine available in the parallel graph * partitioning package parMETIS. * * The number of independent sub-domains noDomains computed by this * algorithm has to be a power of 2. Hence noDomains is the larger * number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow * * npcol is the number of processors used in SuperLU_DIST. * * Arguments * ========= * * A (input) SuperMatrix* * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number * of the linear equations is A->nrow. Matrix A is distributed * in NRformat_loc format. * * perm_r (input) int_t* * Row permutation vector of size A->nrow, which defines the * permutation matrix Pr; perm_r[i] = j means row i of A is in * position j in Pr*A. * * perm_c (output) int_t* * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * nprocs_i (input) int* * Number of processors the input matrix is distributed on in a block * row format. It corresponds to number of processors used in * SuperLU_DIST. * * noDomains (input) int*, must be power of 2 * Number of independent domains to be computed by the graph * partitioning algorithm. ( noDomains <= nprocs_i ) * * sizes (output) int_t**, of size 2 * noDomains * Returns pointer to an array containing the number of nodes * for each sub-domain and each separator. Separators are stored * from left to right. * Memory for the array is allocated in this routine. * * fstVtxSep (output) int_t**, of size 2 * noDomains * Returns pointer to an array containing first node for each * sub-domain and each separator. * Memory for the array is allocated in this routine. * * Return value * ============ * < 0, number of bytes allocated on return from the symbolic factorization. * > 0, number of bytes allocated when out of memory. * </pre> */ float get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c, int nprocs_i, int noDomains, int_t **sizes, int_t **fstVtxSep, gridinfo_t *grid, MPI_Comm *metis_comm) { NRformat_loc *Astore; int iam, p; int *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int; int *options, numflag; int_t m_loc, nnz_loc, fst_row; int_t m, n, bnz, i, j; int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes; int_t *b_rowptr, *b_colind; int_t *dist_order; int *recvcnts, *displs; /* first row index on each processor when the matrix is distributed on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */ int_t *vtxdist_i, *vtxdist_o; int_t szSep, k, noNodes; float apat_mem_l; /* memory used during the computation of the graph of A+A' */ float mem; /* Memory used during this routine */ MPI_Status status; /* Initialization. */ MPI_Comm_rank (grid->comm, &iam); n = A->ncol; m = A->nrow; if ( m != n ) ABORT("Matrix is not square"); mem = 0.; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()"); #endif Astore = (NRformat_loc *) A->Store; nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */ m_loc = Astore->m_loc; /* number of rows local to this processor */ fst_row = Astore->fst_row; /* global index of the first row */ rowptr = Astore->rowptr; /* pointer to rows and column indices */ colind = Astore->colind; #if ( PRNTlevel>=1 ) if ( !iam ) printf(".. 
Use parMETIS ordering on A'+A with %d sub-domains.\n", noDomains); #endif numflag = 0; /* determine first row on each processor */ vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i."); vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o."); MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t, grid->comm); vtxdist_i[nprocs_i] = m; if (noDomains == nprocs_i) { /* keep the same distribution of A */ for (p = 0; p <= nprocs_i; p++) vtxdist_o[p] = vtxdist_i[p]; } else { i = n / noDomains; j = n % noDomains; for (k = 0, p = 0; p < noDomains; p++) { vtxdist_o[p] = k; k += i; if (p < j) k++; } /* The remaining non-participating processors get the same first-row-number as the last processor. */ for (p = noDomains; p <= nprocs_i; p++) vtxdist_o[p] = k; } #if ( DEBUGlevel>=2 ) if (!iam) PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o); #endif /* Compute distributed A + A' */ if ((apat_mem_l = a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i, n, rowptr, colind, noDomains, vtxdist_o, &bnz, &b_rowptr, &b_colind, grid)) > 0) return (apat_mem_l); mem += -apat_mem_l; /* Initialize and allocate storage for parMetis. */ (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes."); l_sizes = *sizes; (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep."); l_fstVtxSep = *fstVtxSep; m_loc = vtxdist_o[iam+1] - vtxdist_o[iam]; if ( iam < noDomains) /* dist_order_int is the perm returned by parMetis, distributed */ if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int)))) ABORT("SUPERLU_MALLOC fails for dist_order_int."); /* ParMETIS represents the column pointers and row indices of * * the input matrix using integers. When SuperLU_DIST uses * * long int for the int_t type, then several supplementary * * copies need to be performed in order to call ParMETIS. 
*/ #if defined (_LONGINT) l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int)); if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int."); /* Allocate storage */ if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int)))) ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]"); for (i = 0; i <= m_loc; i++) b_rowptr_int[i] = b_rowptr[i]; SUPERLU_FREE (b_rowptr); if ( bnz ) { if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int)))) ABORT("SUPERLU_MALLOC fails for b_colind_int[]"); for (i = 0; i < bnz; i++) b_colind_int[i] = b_colind[i]; SUPERLU_FREE (b_colind); } if ( !(vtxdist_o_int = (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int)))) ABORT("SUPERLU_MALLOC fails for vtxdist_o_int."); for (i = 0; i <= nprocs_i; i++) vtxdist_o_int[i] = vtxdist_o[i]; SUPERLU_FREE (vtxdist_o); #else /* Default */ vtxdist_o_int = vtxdist_o; b_rowptr_int = b_rowptr; b_colind_int = b_colind; l_sizes_int = l_sizes; #endif if ( iam < noDomains) { options = (int *) SUPERLU_MALLOC(4 * sizeof(int)); options[0] = 0; options[1] = 0; options[2] = 0; options[3] = 1; ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int, &numflag, options, dist_order_int, l_sizes_int, metis_comm); } if (bnz) SUPERLU_FREE (b_colind_int); if ( iam < noDomains) { SUPERLU_FREE (options); } SUPERLU_FREE (b_rowptr_int); #if defined (_LONGINT) /* Copy data from dist_order_int to dist_order */ if ( iam < noDomains) { /* dist_order is the perm returned by parMetis, distributed */ if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t)))) ABORT("SUPERLU_MALLOC fails for dist_order."); for (i = 0; i < m_loc; i++) dist_order[i] = dist_order_int[i]; SUPERLU_FREE(dist_order_int); for (i = 0; i < 2*noDomains; i++) l_sizes[i] = l_sizes_int[i]; SUPERLU_FREE(l_sizes_int); } #else dist_order = dist_order_int; #endif /* Allgatherv dist_order to get perm_c */ if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) ABORT ("SUPERLU_MALLOC fails for displs."); if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) ABORT ("SUPERLU_MALLOC fails for recvcnts."); for (i = 0; i < nprocs_i; i++) recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i]; displs[0]=0; for(i=1; i < nprocs_i; i++) displs[i] = displs[i-1] + recvcnts[i-1]; MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, mpi_int_t, grid->comm); if ( iam < noDomains) { SUPERLU_FREE (dist_order); } SUPERLU_FREE (vtxdist_i); SUPERLU_FREE (vtxdist_o_int); SUPERLU_FREE (recvcnts); SUPERLU_FREE (displs); /* send l_sizes to every processor p >= noDomains */ if (!iam) for (p = noDomains; p < nprocs_i; p++) MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm); if (noDomains <= iam && iam < nprocs_i) MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm, &status); /* Determine the first node in each separator, store it in l_fstVtxSep */ for (j = 0; j < 2 * noDomains; j++) l_fstVtxSep[j] = 0; l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2]; szSep = noDomains; i = 0; while (szSep != 1) { for (j = i; j < i + szSep; j++) { l_fstVtxSep[j] += l_sizes[j]; } for (j = i; j < i + szSep; j++) { k = i + szSep + (j-i) / 2; l_fstVtxSep[k] += l_fstVtxSep[j]; } i += szSep; szSep = szSep / 2; } l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2]; i = 2 * noDomains - 2; szSep = 1; while (i > 0) { for (j = i; j < i + szSep; j++) { k = (i - 2 * szSep) + (j-i) * 2 + 1; noNodes = l_fstVtxSep[k]; l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k]; l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - noNodes - l_sizes[k-1]; } 
szSep *= 2; i -= szSep; } #if ( PRNTlevel>=2 ) if (!iam ) { PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes); PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep); } #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()"); #endif return (-mem); } /* get_perm_c_parmetis */
static int f(realtype t, N_Vector u, N_Vector udot, void *user_data) { realtype uLeft, uRight, ui, ult, urt; realtype hordc, horac, hdiff, hadv; realtype *udata, *dudata; long int i, my_length; int npes, my_pe, my_pe_m1, my_pe_p1, last_pe, my_last; UserData data; MPI_Status status; MPI_Comm comm; /* Extract MPI info. from data */ data = (UserData) user_data; comm = data->comm; npes = data->npes; my_pe = data->my_pe; /* If this process is inactive, return now */ if (my_pe == npes) return(0); /* Extract problem constants from data */ hordc = data->hdcoef; horac = data->hacoef; /* Find related processes */ my_pe_m1 = my_pe - 1; my_pe_p1 = my_pe + 1; last_pe = npes - 1; /* Obtain local arrays */ udata = NV_DATA_P(u); dudata = NV_DATA_P(udot); my_length = NV_LOCLENGTH_P(u); my_last = my_length - 1; /* Pass needed data to processes before and after current process. */ if (my_pe != 0) MPI_Send(&udata[0], 1, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm); if (my_pe != last_pe) MPI_Send(&udata[my_length-1], 1, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm); /* Receive needed data from processes before and after current process. */ if (my_pe != 0) MPI_Recv(&uLeft, 1, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm, &status); else uLeft = ZERO; if (my_pe != last_pe) MPI_Recv(&uRight, 1, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm, &status); else uRight = ZERO; /* Loop over all grid points in current process. */ for (i=0; i<my_length; i++) { /* Extract u at x_i and two neighboring points */ ui = udata[i]; ult = (i==0) ? uLeft: udata[i-1]; urt = (i==my_length-1) ? uRight : udata[i+1]; /* Set diffusion and advection terms and load into udot */ hdiff = hordc*(ult - TWO*ui + urt); hadv = horac*(urt - ult); dudata[i] = hdiff + hadv; } return(0); }
int main(int argc, char *argv[]) { MPI_Status status; int num, rank, size, tag, next, from; if (argc != 2) { printf("usage: <program name> <number of rounds>\n"); exit(-1); } /* Start up MPI */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); /* Arbitrarily choose 201 to be our tag. Calculate the */ /* rank of the next process in the ring. Use the modulus */ /* operator so that the last process "wraps around" to rank */ /* zero. */ tag = 201; next = (rank + 1) % size; from = (rank + size - 1) % size; /* If we are the "console" process, get an integer from the */ /* user to specify how many times we want to go around the */ /* ring */ if (rank == 0) { num = atoi (argv[1]); printf("Process %d sending %d to %d\n", rank, num, next); MPI_Send(&num, 1, MPI_INT, next, tag, MPI_COMM_WORLD); } /* Pass the message around the ring. The exit mechanism works */ /* as follows: the message (a positive integer) is passed */ /* around the ring; process 0 decrements it on every lap, and */ /* every process forwards it one more time after it reaches */ /* zero so that all ranks see the final value and can exit. */ while (1) { MPI_Recv(&num, 1, MPI_INT, from, tag, MPI_COMM_WORLD, &status); printf("Process %d received %d\n", rank, num); if (rank == 0) { num--; printf("Process 0 decremented num\n"); } printf("Process %d sending %d to %d\n", rank, num, next); MPI_Send(&num, 1, MPI_INT, next, tag, MPI_COMM_WORLD); if (num == 0) { printf("Process %d exiting\n", rank); break; } } /* The last process does one extra send to process 0, which needs */ /* to be received before the program can exit */ if (rank == 0) MPI_Recv(&num, 1, MPI_INT, from, tag, MPI_COMM_WORLD, &status); /* Quit */ MPI_Finalize(); return 0; }
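/* Usage sketch (assuming the ring program above is built into an executable
 * named "ring"): running, for example, mpirun -np 4 ./ring 3 starts four
 * processes; rank 0 injects the value 3, decrements it each time the token
 * completes a lap, and once the value reaches 0 every rank forwards it one
 * last time and exits, with rank 0 absorbing the final message. */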
static void PrintOutput(realtype g_val, N_Vector uB, UserData data) { MPI_Comm comm; MPI_Status status; int npes, my_pe; long int i, Ni, indx, local_N, nperpe, nrem; realtype *uBdata; realtype *mu; comm = data->comm; npes = data->npes; my_pe = data->my_pe; local_N = data->local_N; nperpe = data->nperpe; nrem = data->nrem; uBdata = NV_DATA_P(uB); if (my_pe == npes) { #if defined(SUNDIALS_EXTENDED_PRECISION) printf("\ng(tf) = %8Le\n\n", g_val); printf("dgdp(tf)\n [ 1]: %8Le\n [ 2]: %8Le\n\n", -uBdata[0], -uBdata[1]); #elif defined(SUNDIALS_DOUBLE_PRECISION) printf("\ng(tf) = %8le\n\n", g_val); printf("dgdp(tf)\n [ 1]: %8le\n [ 2]: %8le\n\n", -uBdata[0], -uBdata[1]); #else printf("\ng(tf) = %8e\n\n", g_val); printf("dgdp(tf)\n [ 1]: %8e\n [ 2]: %8e\n\n", -uBdata[0], -uBdata[1]); #endif mu = (realtype *)malloc(NEQ*sizeof(realtype)); if (check_flag((void *)mu, "malloc", 2, my_pe)) MPI_Abort(comm, 1); indx = 0; for ( i = 0; i < npes; i++) { Ni = ( i < nrem ) ? nperpe+1 : nperpe; MPI_Recv(&mu[indx], Ni, PVEC_REAL_MPI_TYPE, i, 0, comm, &status); indx += Ni; } printf("mu(t0)\n"); #if defined(SUNDIALS_EXTENDED_PRECISION) for (i=0; i<NEQ; i++) printf(" [%2ld]: %8Le\n", i+1, mu[i]); #elif defined(SUNDIALS_DOUBLE_PRECISION) for (i=0; i<NEQ; i++) printf(" [%2ld]: %8le\n", i+1, mu[i]); #else for (i=0; i<NEQ; i++) printf(" [%2ld]: %8e\n", i+1, mu[i]); #endif free(mu); } else { MPI_Send(uBdata, local_N, PVEC_REAL_MPI_TYPE, npes, 0, comm); } }
inline void MyMPI_Send (const string & s, int dest) { MPI_Send( const_cast<char*> (s.c_str()), s.length(), MPI_CHAR, dest, 1, MPI_COMM_WORLD); }
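/* A hedged sketch of a matching receive for the string overload of
 * MyMPI_Send above: the sender ships s.length() chars with tag 1 and no
 * terminator, so the receiver probes first and sizes the string from the
 * message length. The name MyMPI_Recv is an assumption; it is not shown in
 * this document. Assumes <mpi.h> is included and string is std::string. */
inline void MyMPI_Recv (string & s, int src)
{
  MPI_Status status;
  int len;
  MPI_Probe (src, 1, MPI_COMM_WORLD, &status);   // wait for the matching tag-1 message
  MPI_Get_count (&status, MPI_CHAR, &len);       // number of chars actually sent
  s.resize (len);
  if (len > 0)
    MPI_Recv (&s[0], len, MPI_CHAR, src, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  else
    MPI_Recv (NULL, 0, MPI_CHAR, src, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // consume empty message
}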