template <class T, int BASE>
inline void MyMPI_Send (FlatArray<T, BASE> s, int dest)
 {
   MPI_Send( &s.First(), s.Size(), MyGetMPIType<T>(), dest, 1, MPI_COMM_WORLD);
 }
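The wrapper above assumes a MyGetMPIType<T>() helper that maps the element type to an MPI datatype. A minimal sketch of how such a trait could look for the built-in cases (an illustration under that assumption, not code from the project):

#include <mpi.h>

// Hypothetical type-mapping trait assumed by MyMPI_Send(); the real project
// may provide more specializations.
template <class T> MPI_Datatype MyGetMPIType ();
template <> inline MPI_Datatype MyGetMPIType<int>    () { return MPI_INT; }
template <> inline MPI_Datatype MyGetMPIType<double> () { return MPI_DOUBLE; }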
Example #2
File: mtr.c Project: ZJLi2013/petsc
/*@C
    PetscMallocDumpLog - Dumps the log of all calls to PetscMalloc(); also calls
       PetscMemoryGetMaximumUsage()

    Collective on PETSC_COMM_WORLD

    Input Parameter:
.   fp - file pointer; or NULL

    Options Database Key:
.  -malloc_log - Activates PetscMallocDumpLog()

    Level: advanced

   Fortran Note:
   The calling sequence in Fortran is PetscMallocDumpLog(integer ierr)
   The fp defaults to stdout.

.seealso: PetscMallocGetCurrentUsage(), PetscMallocDump(), PetscMallocSetDumpLog()
@*/
PetscErrorCode  PetscMallocDumpLog(FILE *fp)
{
  PetscInt       i,j,n,dummy,*perm;
  size_t         *shortlength;
  int            *shortcount,err;
  PetscMPIInt    rank,size,tag = 1212 /* very bad programming */;
  PetscBool      match;
  const char     **shortfunction;
  PetscLogDouble rss;
  MPI_Status     status;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRQ(ierr);
  /*
       Try to get the data printed in order by processor. This will only sometimes work
  */
  err = fflush(fp);
  if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file");

  ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
  if (rank) {
    ierr = MPI_Recv(&dummy,1,MPIU_INT,rank-1,tag,MPI_COMM_WORLD,&status);CHKERRQ(ierr);
  }

  if (PetscLogMalloc < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"PetscMallocDumpLog() called without call to PetscMallocSetDumpLog() this is often due to\n                      setting the option -malloc_log AFTER PetscInitialize() with PetscOptionsInsert() or PetscOptionsInsertFile()");

  if (!fp) fp = PETSC_STDOUT;
  ierr = PetscMemoryGetMaximumUsage(&rss);CHKERRQ(ierr);
  if (rss) {
    ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Maximum memory PetscMalloc()ed %.0f maximum size of entire process %.0f\n",rank,(PetscLogDouble)TRMaxMem,rss);CHKERRQ(ierr);
  } else {
    ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Maximum memory PetscMalloc()ed %.0f OS cannot compute size of entire process\n",rank,(PetscLogDouble)TRMaxMem);CHKERRQ(ierr);
  }
  shortcount    = (int*)malloc(PetscLogMalloc*sizeof(int));if (!shortcount) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory");
  shortlength   = (size_t*)malloc(PetscLogMalloc*sizeof(size_t));if (!shortlength) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory");
  shortfunction = (const char**)malloc(PetscLogMalloc*sizeof(char*));if (!shortfunction) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory");
  for (i=0,n=0; i<PetscLogMalloc; i++) {
    for (j=0; j<n; j++) {
      ierr = PetscStrcmp(shortfunction[j],PetscLogMallocFunction[i],&match);CHKERRQ(ierr);
      if (match) {
        shortlength[j] += PetscLogMallocLength[i];
        shortcount[j]++;
        goto foundit;
      }
    }
    shortfunction[n] = PetscLogMallocFunction[i];
    shortlength[n]   = PetscLogMallocLength[i];
    shortcount[n]    = 1;
    n++;
foundit:;
  }

  perm = (PetscInt*)malloc(n*sizeof(PetscInt));if (!perm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"Out of memory");
  for (i=0; i<n; i++) perm[i] = i;
  ierr = PetscSortStrWithPermutation(n,(const char**)shortfunction,perm);CHKERRQ(ierr);

  ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] Memory usage sorted by function\n",rank);CHKERRQ(ierr);
  for (i=0; i<n; i++) {
    ierr = PetscFPrintf(MPI_COMM_WORLD,fp,"[%d] %d %.0f %s()\n",rank,shortcount[perm[i]],(PetscLogDouble)shortlength[perm[i]],shortfunction[perm[i]]);CHKERRQ(ierr);
  }
  free(perm);
  free(shortlength);
  free(shortcount);
  free((char**)shortfunction);
  err = fflush(fp);
  if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file");
  if (rank != size-1) {
    ierr = MPI_Send(&dummy,1,MPIU_INT,rank+1,tag,MPI_COMM_WORLD);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
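The MPI_Recv/MPI_Send pair around the printing above forms a simple token ring so that output appears in rank order. A standalone sketch of the same idiom in plain MPI (illustrative only):

#include <mpi.h>
#include <stdio.h>

/* Each rank waits for a token from rank-1, prints, then passes the token on,
   so output is serialized in rank order (subject to stdout buffering). */
void print_in_rank_order(const char *msg)
{
  int rank, size, token = 0, tag = 1212;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (rank)
    MPI_Recv(&token, 1, MPI_INT, rank - 1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  printf("[%d] %s\n", rank, msg);
  fflush(stdout);
  if (rank != size - 1)
    MPI_Send(&token, 1, MPI_INT, rank + 1, tag, MPI_COMM_WORLD);
}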
static int fB(realtype t, N_Vector u, 
              N_Vector uB, N_Vector uBdot, void *user_dataB)
{
  realtype *uBdata, *duBdata, *udata;
  realtype uBLeft, uBRight, uBi, uBlt, uBrt;
  realtype uLeft, uRight, ui, ult, urt;
  realtype dx, hordc, horac, hdiff, hadv;
  realtype *z1, *z2, intgr1, intgr2;
  long int i, my_length;
  int npes, my_pe, my_pe_m1, my_pe_p1, last_pe, my_last;
  UserData data;
  realtype data_in[2], data_out[2];
  MPI_Status status;
  MPI_Comm comm;

  /* Extract MPI info. from data */
  data = (UserData) user_dataB;
  comm = data->comm;
  npes = data->npes;
  my_pe = data->my_pe;

  if (my_pe == npes) { /* This process performs the quadratures */

    /* Obtain local arrays */
    duBdata = NV_DATA_P(uBdot);
    my_length = NV_LOCLENGTH_P(uB);

    /* Loop over all other processes and load right hand side of quadrature eqs. */
    duBdata[0] = ZERO;
    duBdata[1] = ZERO;
    for (i=0; i<npes; i++) {
      MPI_Recv(&intgr1, 1, PVEC_REAL_MPI_TYPE, i, 0, comm, &status); 
      duBdata[0] += intgr1;
      MPI_Recv(&intgr2, 1, PVEC_REAL_MPI_TYPE, i, 0, comm, &status); 
      duBdata[1] += intgr2;
    }

  } else { /* This process integrates part of the PDE */

    /* Extract problem constants and work arrays from data */
    dx    = data->dx;
    hordc = data->hdcoef;
    horac = data->hacoef;
    z1    = data->z1;
    z2    = data->z2;

    /* Obtain local arrays */
    uBdata = NV_DATA_P(uB);
    duBdata = NV_DATA_P(uBdot);
    udata = NV_DATA_P(u);
    my_length = NV_LOCLENGTH_P(uB);

    /* Compute related parameters. */
    my_pe_m1 = my_pe - 1;
    my_pe_p1 = my_pe + 1;
    last_pe  = npes - 1;
    my_last  = my_length - 1;

    /* Pass needed data to processes before and after current process. */
    if (my_pe != 0) {
      data_out[0] = udata[0];
      data_out[1] = uBdata[0];
    
      MPI_Send(data_out, 2, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm);
    }
    if (my_pe != last_pe) {
      data_out[0] = udata[my_length-1];
      data_out[1] = uBdata[my_length-1];

      MPI_Send(data_out, 2, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm);
    }
    
    /* Receive needed data from processes before and after current process. */
    if (my_pe != 0) {
      MPI_Recv(data_in, 2, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm, &status);
      
      uLeft = data_in[0];
      uBLeft = data_in[1];
    } else {
      uLeft = ZERO;
      uBLeft = ZERO;
    }
    if (my_pe != last_pe) {
      MPI_Recv(data_in, 2, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm, &status);

      uRight = data_in[0];
      uBRight = data_in[1];
    } else {
      uRight = ZERO;
      uBRight = ZERO;
    }

    /* Loop over all grid points in current process. */
    for (i=0; i<my_length; i++) {
      
      /* Extract uB at x_i and two neighboring points */
      uBi = uBdata[i];
      uBlt = (i==0) ? uBLeft: uBdata[i-1];
      uBrt = (i==my_length-1) ? uBRight : uBdata[i+1];
      
      /* Set diffusion and advection terms and load into udot */
      hdiff = hordc*(uBlt - TWO*uBi + uBrt);
      hadv = horac*(uBrt - uBlt);
      duBdata[i] = - hdiff + hadv;

      /* Extract u at x_i and two neighboring points */
      ui = udata[i];
      ult = (i==0) ? uLeft: udata[i-1];
      urt = (i==my_length-1) ? uRight : udata[i+1];

      /* Load integrands of the two space integrals */
      z1[i] = uBdata[i]*(ult - TWO*ui + urt)/(dx*dx);
      z2[i] = uBdata[i]*(urt - ult)/(TWO*dx);
    }

    /* Compute local integrals */
    intgr1 = Xintgr(z1, my_length, dx);
    intgr2 = Xintgr(z2, my_length, dx);

    /* Send local integrals to 'quadrature' process */
    MPI_Send(&intgr1, 1, PVEC_REAL_MPI_TYPE, npes, 0, comm);
    MPI_Send(&intgr2, 1, PVEC_REAL_MPI_TYPE, npes, 0, comm);

  }


  return(0);
}
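The send/receive pairs in fB() implement a one-dimensional halo exchange with the left and right neighbours. The same exchange can be sketched more compactly with MPI_Sendrecv and MPI_PROC_NULL at the physical boundaries (a generic illustration, not SUNDIALS code):

#include <mpi.h>

/* Exchange one boundary value with each neighbour; ranks at the domain ends
   use MPI_PROC_NULL, so their left/right defaults of 0.0 are kept. */
void exchange_boundaries(double *local, long n, int my_pe, int npes,
                         MPI_Comm comm, double *left, double *right)
{
  MPI_Status status;
  int lnbr = (my_pe > 0)        ? my_pe - 1 : MPI_PROC_NULL;
  int rnbr = (my_pe < npes - 1) ? my_pe + 1 : MPI_PROC_NULL;

  *left = *right = 0.0;
  MPI_Sendrecv(&local[0],     1, MPI_DOUBLE, lnbr, 0,
               right,         1, MPI_DOUBLE, rnbr, 0, comm, &status);
  MPI_Sendrecv(&local[n - 1], 1, MPI_DOUBLE, rnbr, 1,
               left,          1, MPI_DOUBLE, lnbr, 1, comm, &status);
}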
Example #4
void send_values(double *v, int n, int child_id)
{
    MPI_Send(&n, 1, MPI_INT, child_id, tag, MPI_COMM_WORLD);
    MPI_Send(v, n, MPI_DOUBLE, child_id, tag, MPI_COMM_WORLD);
}
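A matching receive side for send_values() would read the count first and then the payload; a hypothetical sketch (the function name, parent_id, and passing tag explicitly are assumptions for illustration):

#include <mpi.h>
#include <stdlib.h>

/* Hypothetical counterpart to send_values(): receive the element count, then
   allocate and receive the values themselves. Caller frees the result. */
double *recv_values(int *n, int parent_id, int tag)
{
    double *v;
    MPI_Recv(n, 1, MPI_INT, parent_id, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    v = (double *) malloc(*n * sizeof(double));
    MPI_Recv(v, *n, MPI_DOUBLE, parent_id, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    return v;
}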
Example #5
// ===========================================================================
// matrix_mult_mpi()            MPI version of dense matrix multiplication.
//                              Processor cores are laid out in a 2D space
//                              core_rows * core_cols layout. Each core
//                              has a portion of matrices A and B (each such
//                              portion blk_size * blk_size numbers) and
//                              computes a portion of matrix C = A * B.
// ===========================================================================
// * INPUTS
//   int num_procs              Number of processors to use from the MPI setup
//   int tile_size              Number of core rows and columns in the 2D layout
//   int blk_size               Number of rows and columns for each portion.
//                              The total size of A, B and C is
//                              tile_size ^ 2 * blk_size ^ 2 elements.
//   int verify                 0: don't verify the multiplication
//                              1: core 0 gathers all matrices back and
//                                 does the verification
//
// * RETURN VALUE
//   int                        0 for success
// ===========================================================================
int matrix_mult_mpi(int num_procs, int tile_size, int blk_size, int verify) {

  int                   num_cores;
  int                   rank;
  int                   tile_row;
  int                   tile_col;
  int                   blk_size_sq = 0;
  int                   whole_size = 0;
  int                   whole_size_sq = 0;
  int                   phase;
  int                   tag_a;
  int                   tag_b;
  int                   tag_c;
  MPI_Request           reqs[2];
  int                   num_reqs;
  MPI_Status            status;
  float                 *a = NULL;
  float                 *b = NULL;
  float                 *c = NULL;
  int                   peer_rank;
  float                 *peer_a = NULL;
  float                 *peer_b = NULL;
  float                 *work_a = NULL;
  float                 *work_b = NULL;
  float                 *whole_a = NULL;
  float                 *whole_b = NULL;
  float                 *whole_c = NULL;
  float                 *peer_c = NULL;
  float                 verify_val;
#ifdef ARCH_MB
  unsigned int          time_start = 0;
  unsigned int          time_stop;
  unsigned int          time;
#endif
  int                   i;
  int                   j;
  int                   k;


  // Initialize MPI
  //MPI_Init(NULL, NULL);
 
  // Who are we?
  MPI_Comm_size(MPI_COMM_WORLD, &num_cores);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Sanity checks
  if (num_cores < num_procs) {
    if (!rank) {
      kt_printf("Cannot run with %d cores, MPI setup has only %d cores\r\n",
                num_procs, num_cores);
    }
    return 1;
  }
  if (tile_size * tile_size != num_procs) {
    if (!rank) {
      kt_printf("%d * %d tiles != %d cores\r\n", 
                tile_size, tile_size, num_procs);
    }
    return 1;
  }
  rank_to_tile(tile_size, rank, &tile_row, &tile_col);


  // Are we running?
  if (rank < num_procs) {

    // Create array portions
    blk_size_sq = blk_size * blk_size;
    whole_size = tile_size * blk_size;
    whole_size_sq = whole_size * whole_size;
    a = kt_malloc(blk_size_sq * sizeof(float));
    b = kt_malloc(blk_size_sq * sizeof(float));
    c = kt_malloc(blk_size_sq * sizeof(float));

    // Initialize our portions of A and B with some values, zero out C
    for (i = 0; i < blk_size_sq; i++) {
      a[i] = rank + i;
      b[i] = rank - i;
      c[i] = 0.0;
    }

    // Create two buffers to receive peer A and B portions
    peer_a = kt_malloc(blk_size_sq * sizeof(float));
    peer_b = kt_malloc(blk_size_sq * sizeof(float));

    // Assign tags for A and B send/recvs
    tag_a = 42;
    tag_b = 666;
    tag_c = 3;
  }

  // Synchronize everyone
  if (!rank) {
    kt_printf("Matrix multiplication of %d x %d starting on %d core(s)\r\n", 
              whole_size, whole_size, num_procs);
  }
  MPI_Barrier(MPI_COMM_WORLD);

  // Keep time
  if (!rank) {
#ifdef ARCH_MB
    time_start = ar_glob_timer_read();
#endif
  }
  
  // If not part of active cores, skip to next barrier
  if (rank >= num_procs) {
    goto skip;
  }


  // For all the phases in the algorithm
  for (phase = 0; phase < tile_size; phase++) {

    // Remember how many waits we'll have to do
    num_reqs = 0;

    // Are we responsible to broadcast our A portion?
    if (tile_col == phase) {
      // Broadcast to others in our tile row
      for (i = 0; i < tile_size; i++) {
        if (i != tile_col) {
          peer_rank = tile_to_rank(tile_size, tile_row, i);
          MPI_Isend(a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, 
                    MPI_COMM_WORLD, NULL);
        }
      }
      work_a = a;
    }
    else {
      // Receive A portion from someone else from our tile row
      peer_rank = tile_to_rank(tile_size, tile_row, phase);
      MPI_Irecv(peer_a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, 
                MPI_COMM_WORLD, &reqs[num_reqs++]);
      work_a = peer_a;
    }


    // Are we responsible to broadcast our B portion?
    if (tile_row == phase) {
      // Broadcast to others in our tile col
      for (i = 0; i < tile_size; i++) {
        if (i != tile_row) {
          peer_rank = tile_to_rank(tile_size, i, tile_col);
          MPI_Isend(b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, 
                    MPI_COMM_WORLD, NULL);
        }
      }
      work_b = b;
    }
    else {
      // Receive B portion from someone else from our tile col
      peer_rank = tile_to_rank(tile_size, phase, tile_col);
      MPI_Irecv(peer_b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, 
                MPI_COMM_WORLD, &reqs[num_reqs++]);
      work_b = peer_b;
    }


    // Wait for needed receives to arrive
    if (num_reqs) {
      MPI_Waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
    }

    // Add to partial results of C the A * B portion sums
    for (i = 0; i < blk_size; i++) {
      for (j = 0; j < blk_size; j++) {
        for (k = 0; k < blk_size; k++) {
          c[i * blk_size + j] += work_a[i * blk_size + k] * 
                                 work_b[k * blk_size + j];
        }
      }
    }
  }


  // Synchronize
skip:
  MPI_Barrier(MPI_COMM_WORLD);

  // Keep time
  if (!rank) {
#ifdef ARCH_MB
    time_stop = ar_glob_timer_read();
    if (time_stop > time_start) {
      time = time_stop - time_start;
    }
    else {
      time = 0xFFFFFFFF - (time_start - time_stop);
    }
    kt_printf("Time: %10u cycles (%6u msec)\r\n", time, time / 10000);
#endif
  }

  if (!verify) {
    goto finished;
  }

  // Rank 0 gathers all A, B and C's and verifies
  if (rank == 0) {

    kt_printf("Multiplication finished, rank 0 is gathering results...\r\n");

    // Allocate big arrays
    whole_a = kt_malloc(whole_size_sq * sizeof(float));
    whole_b = kt_malloc(whole_size_sq * sizeof(float));
    whole_c = kt_malloc(whole_size_sq * sizeof(float));
    
    // Allocate partial C buffer
    peer_c = kt_malloc(blk_size_sq * sizeof(float));

    // Place my partial arrays
    matrix_mult_mpi_place_partial(whole_a, (float *) a, tile_size, blk_size, 0);
    matrix_mult_mpi_place_partial(whole_b, (float *) b, tile_size, blk_size, 0);
    matrix_mult_mpi_place_partial(whole_c, (float *) c, tile_size, blk_size, 0);

    // Gather from others
    for (peer_rank = 1; peer_rank < num_procs; peer_rank++) {
      MPI_Recv(peer_a, blk_size_sq, MPI_FLOAT, peer_rank, tag_a, 
               MPI_COMM_WORLD, &status);
      MPI_Recv(peer_b, blk_size_sq, MPI_FLOAT, peer_rank, tag_b, 
               MPI_COMM_WORLD, &status);
      MPI_Recv(peer_c, blk_size_sq, MPI_FLOAT, peer_rank, tag_c, 
               MPI_COMM_WORLD, &status);

      matrix_mult_mpi_place_partial(whole_a, (float *) peer_a, tile_size, 
                                    blk_size, peer_rank);
      matrix_mult_mpi_place_partial(whole_b, (float *) peer_b, tile_size, 
                                    blk_size, peer_rank);
      matrix_mult_mpi_place_partial(whole_c, (float *) peer_c, tile_size, 
                                    blk_size, peer_rank);
    }

    // Print
    //kt_printf("A is:\r\n");
    //matrix_mult_mpi_print_matrix(whole_a, whole_size);
    //kt_printf("\r\nB is:\r\n");
    //matrix_mult_mpi_print_matrix(whole_b, whole_size);
    //kt_printf("\r\nC is:\r\n");
    //matrix_mult_mpi_print_matrix(whole_c, whole_size);
    //kt_printf("\r\n");

    // Verify
    for (i = 0; i < whole_size; i++) {
      for (j = 0; j < whole_size; j++) {
        verify_val = 0;
        for (k = 0; k < whole_size; k++) {
          verify_val += whole_a[i * whole_size + k] * 
                        whole_b[k * whole_size + j];
        }
        if (whole_c[i * whole_size + j] != verify_val) {
          kt_printf("Results gathered: "
                    "Verification FAILED at C[%d, %d]\r\n", i, j);
          while (1) {
            ;
          }
        }
      }
    }
    kt_printf("Results gathered. Verification PASSED\r\n");
  }
  else if (rank < num_procs) {
    // Send partial arrays to rank 0
    MPI_Send(a, blk_size_sq, MPI_FLOAT, 0, tag_a, MPI_COMM_WORLD);
    MPI_Send(b, blk_size_sq, MPI_FLOAT, 0, tag_b, MPI_COMM_WORLD);
    MPI_Send(c, blk_size_sq, MPI_FLOAT, 0, tag_c, MPI_COMM_WORLD);
  }

finished:

  // Free stuff
  if (rank < num_procs) {
    kt_free(a);
    kt_free(b);
    kt_free(c);
    kt_free(peer_a);
    kt_free(peer_b);
    if (verify) {
      kt_free(whole_a);
      kt_free(whole_b);
      kt_free(whole_c);
      kt_free(peer_c);
    }
  }

  return 0;
}
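The kernel above relies on external tile_to_rank()/rank_to_tile() helpers to map between 2D tile coordinates and MPI ranks. A plausible row-major mapping, consistent with how the helpers are called here (an assumption; the actual project code may differ):

/* Hypothetical row-major mapping between (tile_row, tile_col) and rank. */
static int tile_to_rank(int tile_size, int row, int col)
{
  return row * tile_size + col;
}

static void rank_to_tile(int tile_size, int rank, int *row, int *col)
{
  *row = rank / tile_size;
  *col = rank % tile_size;
}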
Example #6
int main(int argc, char *argv[])
{
	int rank;
	int n_ranks, start_rank;
	int i,j;
	float gamma = 0.25, rho = -0.495266;
	float GLOB_SUM = 0, sum = 0;

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	printf("before get data in id %d\n", rank);

	get_data(rank%4);
	start_rank = 6;
	n_ranks = 4;

	printf("done getting dat rank %d\n", rank);

	MPI_Barrier(MPI_COMM_WORLD);
//	printf("crossing bar1 %d\n", rank);
	
	for (j = 0; j < INPUT_SIZE; ++j)
	{
		get_input(rank, start_rank, n_ranks);
		sum = compute_svm_sum(rank%4, gamma);
		if(rank == start_rank)
		{
			float tempBuff;
			GLOB_SUM = sum;
			for (i = start_rank+1; i < start_rank + n_ranks; ++i) {
				MPI_Recv(&tempBuff, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				GLOB_SUM = GLOB_SUM + tempBuff;
			}
			GLOB_SUM -= rho;
		}
		else {
			MPI_Send((float*)&sum, 1, MPI_FLOAT, start_rank, 0, MPI_COMM_WORLD);
		}
	}

	//if(rank != 6)
	//printf("before bar2 %d\n", rank);

	MPI_Barrier(MPI_COMM_WORLD);
	if(rank == 6)
	{
		#ifdef DUMP
			m5_dump_stats(0, 0);
			m5_reset_stats(0, 0);
		#endif
	}

	//printf("done with thread %d\n", rank);

	if(rank == 6)	
		printf("global sum = %f\n", GLOB_SUM);
//	free_data();
	MPI_Finalize();
	return 0;
}
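The loop that receives partial sums on start_rank is a hand-rolled reduction; the same step could be expressed with MPI_Reduce over a communicator containing only the participating ranks. A sketch under that assumption (the helper name and signature are illustrative):

#include <mpi.h>

/* Hypothetical alternative to the manual receive loop above: sum the partial
   SVM sums onto start_rank via MPI_Reduce on a split communicator. */
float gather_svm_sum(float local_sum, int rank, int start_rank, int n_ranks, float rho)
{
    MPI_Comm svm_comm;
    float glob_sum = 0.0f;
    int in_group = (rank >= start_rank && rank < start_rank + n_ranks);

    MPI_Comm_split(MPI_COMM_WORLD, in_group ? 0 : MPI_UNDEFINED, rank, &svm_comm);
    if (in_group) {
        /* start_rank has the smallest key, so it becomes root 0 of svm_comm */
        MPI_Reduce(&local_sum, &glob_sum, 1, MPI_FLOAT, MPI_SUM, 0, svm_comm);
        if (rank == start_rank) glob_sum -= rho;
        MPI_Comm_free(&svm_comm);
    }
    return glob_sum;
}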
Example #7
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES     *agmres = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc   = agmres->Qloc;
  PetscScalar    *sgn    = agmres->sgn;
  PetscScalar    *tloc   = agmres->tloc;
  PetscErrorCode ierr;
  PetscReal      *wbufptr = agmres->wbufptr;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First;
  PetscMPIInt    Last     = agmres->Last;
  PetscBLASInt   pas,len,bnloc,bpos;
  PetscInt       nloc,d, i, j, k;
  PetscInt       pos;
  PetscReal      c, s, rho, Ajj, val, tt, old;
  PetscScalar    *col;
  MPI_Status     status;
  PetscBLASInt   N = MAXKSPSIZE + 1;


  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)ksp,&comm);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr);
  ierr = PetscMemzero(agmres->Rloc, N*N*sizeof(PetscScalar));CHKERRQ(ierr);
  /* check input arguments */
  if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  ierr = VecGetLocalSize(VEC_V(0), &nloc);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(nloc,&bnloc);CHKERRQ(ierr);
  if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    ierr = VecGetArray(VEC_V(j), &col);CHKERRQ(ierr);
    PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    ierr = VecRestoreArray(VEC_V(j), &col);CHKERRQ(ierr);
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas);
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len            = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /*annihilate undesirable Rloc, diagonal by diagonal*/
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      ierr = MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);CHKERRQ(ierr);
    } else {
      ierr = MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);CHKERRQ(ierr);
      /*Elimination of Rloc(1,d)*/
      c    = wbufptr[d];
      s    = Qloc[d*nloc];
      ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1);CHKERRQ(ierr);
      /*Apply Givens Rotation*/
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   =  c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] =  s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        ierr = MPI_Send(& (wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);CHKERRQ(ierr);
      }
      /* zero-out the d-th diagonal of Rloc ...*/
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j]*/
        i    = j - d;
        c    = Qloc[j*nloc+i-1];
        s    = Qloc[j*nloc+i];
        ierr = KSPAGMRESRoddecGivens(&c, &s, &rho, 1);CHKERRQ(ierr);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   =   s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }

  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos    = nvec - d;
      ierr = PetscBLASIntCast(pos,&bpos);CHKERRQ(ierr);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /*BroadCast Rloc to all other processes
   * NWD : should not be needed
   */
  ierr = MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
void mpi_distribute(int Mx){
  if ( taskid == MASTER ) {
    averow    =   Mx/numworkers;
    extra     =   Mx%numworkers;
    offset    =   0;
    for ( rank=1; rank <= (numworkers); rank++) {
      rows         =   (rank <= extra) ? averow+1 : averow;
      left_node    =   rank - 1;
      right_node   =   rank + 1;

      if ( rank == 1 ) {
        left_node  = NONE;
      }
      if ( rank == (numworkers) ) {
        right_node = NONE;
      }

      dest = rank;

      MPI_Send(&offset,               1,                   MPI_INT,         dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&rows,                 1,                   MPI_INT,         dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&left_node,            1,                   MPI_INT,         dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&right_node,           1,                   MPI_INT,         dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&phi_old[offset*Mx],      rows*Mx,          MPI_DOUBLE,      dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&mu_old[offset*Mx],       rows*Mx,          MPI_DOUBLE,      dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&u_old[offset*Mx],       rows*Mx,          MPI_DOUBLE,      dest,   BEGIN,  MPI_COMM_WORLD);
      MPI_Send(&v_old[offset*Mx],       rows*Mx,          MPI_DOUBLE,      dest,   BEGIN,  MPI_COMM_WORLD);
      offset = offset + rows;
    }
  }else{
    source =  MASTER;
    MPI_Recv(&offset,        1,      MPI_INT,     source,    BEGIN,   MPI_COMM_WORLD,  &status);
    MPI_Recv(&rows,          1,      MPI_INT,     source,    BEGIN,   MPI_COMM_WORLD,  &status);
    MPI_Recv(&left_node,     1,      MPI_INT,     source,    BEGIN,   MPI_COMM_WORLD,  &status);
    MPI_Recv(&right_node,    1,      MPI_INT,     source,    BEGIN,   MPI_COMM_WORLD,  &status);

    start = 1;
    if((taskid ==1) || (taskid == numworkers)) {
      if(taskid == 1) {
        MPI_Recv(&phi_old[0],   rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
        MPI_Recv(&mu_old[0],    rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#ifdef FLUID
        MPI_Recv(&u_old[0],   rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
        MPI_Recv(&v_old[0],    rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#endif
      }
      else {
        MPI_Recv(&phi_old[Mx],  rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
        MPI_Recv(&mu_old[Mx],   rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#ifdef FLUID
        MPI_Recv(&u_old[Mx],  rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
        MPI_Recv(&v_old[Mx],   rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#endif
      }
      end = rows-1;
    } else {
      MPI_Recv(&phi_old[Mx],    rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
      MPI_Recv(&mu_old[Mx],     rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#ifdef FLUID
      MPI_Recv(&u_old[Mx],    rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
      MPI_Recv(&v_old[Mx],     rows*Mx,          MPI_DOUBLE,      source,   BEGIN,  MPI_COMM_WORLD, &status);
#endif
      end = rows;
    }
  }
}
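mpi_distribute() hands out averow/extra rows per worker with point-to-point sends. The same partitioning can be written as counts and displacements (in elements of row width Mx), which is also what an MPI_Scatterv-based variant would need; a small illustrative sketch:

/* Sketch: the averow/extra row split above expressed as per-worker element
   counts and displacements (0-based worker index r corresponds to rank r+1). */
void build_row_partition(int Mx, int rows_total, int nworkers,
                         int *counts, int *displs)
{
  int averow = rows_total / nworkers;
  int extra  = rows_total % nworkers;
  int offset = 0;
  int r;
  for (r = 0; r < nworkers; r++) {
    int rows  = (r < extra) ? averow + 1 : averow;
    counts[r] = rows * Mx;
    displs[r] = offset * Mx;
    offset   += rows;
  }
}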
Example #9
void OUTPUT_ADAM_STATS(ElementsHashTable* El_Table, MatProps* matprops_ptr, TimeProps* timeprops_ptr, StatProps* statprops_ptr)
{
    int myid, numprocs;
    IF_MPI(MPI_Status status);
    
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    
    double velocity2 = 0.0;
    double vmax = 0, hmax = 0;
    double xy_cen[2] = { 0.0, 0.0 }, vh_cen[2] = { 0.0, 0.0 };
    double xyh_vmax[3] = { 0.0, 0.0, 0.0 };
    double xyv_hmax[3] = { 0.0, 0.0, 0.0 };
    double masscenterdist2 = 0.0, masscentermindist2 = HUGE_VAL;
    double xycen[2] = { 0.0, 0.0 };
    double vmax_min_height = matprops_ptr->scale.max_negligible_height * 512.0 * ADAM_HEIGHT_FRAC;
    int i;
    struct
    { //for use with MPI_MAXLOC
        double val;
        int rank;
    } send, receive;
    
    xy_cen[0] = statprops_ptr->xcen / (matprops_ptr->scale.length);
    xy_cen[1] = statprops_ptr->ycen / (matprops_ptr->scale.length);
    
    double VxVy[2];
    
    int no_of_buckets = El_Table->get_no_of_buckets();
    vector<HashEntryLine> &bucket=El_Table->bucket;
    tivector<Element> &elenode_=El_Table->elenode_;
    
    //@ElementsBucketDoubleLoop
    for(int ibuck = 0; ibuck < no_of_buckets; ibuck++)
    {
        for(int ielm = 0; ielm < bucket[ibuck].ndx.size(); ielm++)
        {
            Element* EmTemp = &(elenode_[bucket[ibuck].ndx[ielm]]);
            if(EmTemp->adapted_flag() > 0)
            {
                double height=EmTemp->state_vars(0);
                EmTemp->eval_velocity(0.0, 0.0, VxVy);
                velocity2 = VxVy[0] * VxVy[0] + VxVy[1] * VxVy[1];

                //get v and h at center of mass
                masscenterdist2 = (EmTemp->coord(0) - xy_cen[0]) * (EmTemp->coord(0) - xy_cen[0])
                        + (EmTemp->coord(1) - xy_cen[1]) * (EmTemp->coord(1) - xy_cen[1]);
                if(masscenterdist2 < masscentermindist2)
                {
                    masscentermindist2 = masscenterdist2;
                    vh_cen[0] = velocity2;
                    vh_cen[1] = height;
                    xycen[0] = EmTemp->coord(0);
                    xycen[1] = EmTemp->coord(1);
                }

                //eliminate fast moving very thin pile from consideration
                if(height >= vmax_min_height)
                {

                    if(velocity2 > vmax)
                    {
                        /* velocity2 is not a mistake... only need to take the root of 
                         the maximum value */
                        vmax = velocity2;

                        xyh_vmax[0] = EmTemp->coord(0);
                        xyh_vmax[1] = EmTemp->coord(1);
                        xyh_vmax[2] = height;
                    }
                }

                if(height > hmax)
                {

                    hmax = height;

                    xyv_hmax[0] = EmTemp->coord(0);
                    xyv_hmax[1] = EmTemp->coord(1);
                    xyv_hmax[2] = velocity2;

                }
            }
        }
    }
    
    vh_cen[0] = sqrt(vh_cen[0]);
    vmax = sqrt(vmax);
    xyv_hmax[2] = sqrt(xyv_hmax[2]);
    
    /* get the max value across all processors */
#ifdef USE_MPI
    if(numprocs > 1)
    {
        send.rank = myid;
        
        //at center of mass
        send.val = masscentermindist2;
        MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MINLOC, MPI_COMM_WORLD);
        if(receive.rank != 0)
        { /* don't send location if it's already on the 
         root processor */
            if(receive.rank == myid)
                MPI_Send(vh_cen, 2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
            else if(myid == 0)
                MPI_Recv(vh_cen, 2, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status);
        }
        
        //at location of vmax
        send.val = vmax;
        MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
        vmax = receive.val;
        
        if(receive.rank != 0)
        { /* don't send location if it's already on the 
         root processor */
            if(receive.rank == myid)
                MPI_Send(xyh_vmax, 3, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
            else if(myid == 0)
                MPI_Recv(xyh_vmax, 3, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status);
        }
        
        //at location of hmax
        send.val = hmax;
        MPI_Allreduce(&send, &receive, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
        hmax = receive.val;
        
        if(receive.rank != 0)
        { /* don't send location if it's already on the 
         root processor */
            if(receive.rank == myid)
                MPI_Send(xyv_hmax, 3, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
            else if(myid == 0)
                MPI_Recv(xyv_hmax, 3, MPI_DOUBLE, receive.rank, 0, MPI_COMM_WORLD, &status);
        }
    }
#endif //USE_MPI
    
    if(myid == 0)
    {
        FILE *fp = fopen("flow_dynamics.stats", "a");
        
        fprintf(fp,
                "%16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g,   %16.10g\n",
                timeprops_ptr->timesec(), //time in seconds
                
                statprops_ptr->vmean, //average velocity
                
                //x,y,v,h at center of mass
                statprops_ptr->xcen,
                statprops_ptr->ycen,
                vh_cen[0] * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)),
                vh_cen[1] * (matprops_ptr->scale.height),
                
                //x,y,v,h at location of vmax
                xyh_vmax[0] * matprops_ptr->scale.length,
                xyh_vmax[1] * matprops_ptr->scale.length,
                vmax * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)),
                xyh_vmax[2] * (matprops_ptr->scale.height),
                
                //x,y,v,h at location of hmax
                xyv_hmax[0] * matprops_ptr->scale.length,
                xyv_hmax[1] * matprops_ptr->scale.length,
                xyv_hmax[2] * sqrt(matprops_ptr->scale.length * (matprops_ptr->scale.gravity)),
                hmax * (matprops_ptr->scale.height));
        
        fclose(fp);
    }
    return;
}
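The Allreduce calls above use the {double, int} struct with MPI_DOUBLE_INT and MPI_MAXLOC/MPI_MINLOC to find an extreme value together with the rank that owns it. A minimal standalone sketch of that idiom:

#include <mpi.h>

typedef struct { double val; int rank; } val_rank_t;

/* Return the global maximum of local_val and the rank that owns it. */
static val_rank_t global_argmax(double local_val, MPI_Comm comm)
{
  val_rank_t send, recv;
  MPI_Comm_rank(comm, &send.rank);
  send.val = local_val;
  MPI_Allreduce(&send, &recv, 1, MPI_DOUBLE_INT, MPI_MAXLOC, comm);
  return recv;
}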
int main(int argc, char* argv[])
{
  std::chrono::time_point<std::chrono::high_resolution_clock> tStart;
  std::chrono::time_point<std::chrono::high_resolution_clock> tStop;
  typedef std::chrono::duration<int,std::milli> millisecs_t ;

  int numprocs, rank, edge, pixel_count, start, end;
  double max_values_sq;
  Uint32 max_iter;
  
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if(numprocs <= 1)
  {
    std::cerr << argv[0] << ": error: requires at least two MPI processes\n";
    return 1;
  }
  
  max_values_sq = 4.0;
  max_iter = 5000;

  edge = (MAX_X * MAX_Y) / (numprocs - 1);

  if(rank > 0)
  {
    int tile = rank - 1;

    Uint32* pixels;

    start = tile * edge;
    end = (tile == numprocs - 2) ? MAX_X * MAX_Y : (tile + 1) * edge;
    pixel_count = end - start;

    pixels = (Uint32*) malloc(pixel_count * sizeof(Uint32));
    calc_lines(start, end, pixels, max_values_sq, max_iter);

    MPI_Send((void*)pixels, pixel_count, MPI_INT, 0, 0, MPI_COMM_WORLD);
    free(pixels);
  }
  else /* rank == 0 */
  {

    int tile, recv_count = (edge + 1);
    char title[100];

    Uint32* field = (Uint32*) malloc(MAX_X * MAX_Y * sizeof(Uint32));
    Uint32* fieldpos;

    SDL_Surface* sdlSurface;
    SDL_Event event;
        
    MPI_Status status;

    tStart = std::chrono::high_resolution_clock::now();
    for(tile = 1; tile < numprocs; tile++)
    {
      start = (tile - 1) * edge;
      end = (tile == numprocs - 1) ? MAX_X * MAX_Y : tile * edge;

      pixel_count = end - start;
      recv_count = pixel_count;

      fieldpos = field+start;

      MPI_Recv(fieldpos, recv_count, MPI_INT, tile, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    }
    tStop = std::chrono::high_resolution_clock::now();
    millisecs_t duration( std::chrono::duration_cast<millisecs_t>(tStop-tStart) ) ;
    long elapsed = duration.count();
    
    SDL_Init(SDL_INIT_EVERYTHING);

    sdlSurface = SDL_SetVideoMode(MAX_X, MAX_Y, 32, SDL_HWSURFACE | SDL_DOUBLEBUF);
    
    std::stringstream ss;
    ss << argv[0] << " " 
    << numprocs << " processes "
    << elapsed*1.e-3 << " sec."
    << "\n";

    SDL_WM_SetCaption(ss.str().c_str(), title);
    std::cout << ss.str().c_str() << "\n";

    draw(sdlSurface, field);

    SDL_Flip(sdlSurface);
   
    do {
      SDL_Delay(50);
      SDL_PollEvent(&event);
    } while( event.type != SDL_QUIT && event.type != SDL_KEYDOWN );
        
    SDL_FreeSurface(sdlSurface);
    SDL_Quit();

    free(field);
  }

  MPI_Finalize();

  return 0;
}
Example #11
void ghost_communicator(GhostCommunicator *gc)
{
    MPI_Status status;
    int n, n2;
    int data_parts = gc->data_parts;

    GHOST_TRACE(fprintf(stderr, "%d: ghost_comm %p, data_parts %d\n", this_node, gc, data_parts));

    for (n = 0; n < gc->num; n++) {
        GhostCommunication *gcn = &gc->comm[n];
        int comm_type = gcn->type & GHOST_JOBMASK;
        int prefetch  = gcn->type & GHOST_PREFETCH;
        int poststore = gcn->type & GHOST_PSTSTORE;
        int node      = gcn->node;

        GHOST_TRACE(fprintf(stderr, "%d: ghost_comm round %d, job %x\n", this_node, n, gc->comm[n].type));
        GHOST_TRACE(fprintf(stderr, "%d: ghost_comm shift %f %f %f\n",this_node, gc->comm[n].shift[0], gc->comm[n].shift[1], gc->comm[n].shift[2]));
        if (comm_type == GHOST_LOCL)
            cell_cell_transfer(gcn, data_parts);
        else {
            /* prepare send buffer if necessary */
            if (is_send_op(comm_type, node)) {
                /* ok, we send this step, prepare send buffer if not yet done */
                if (!prefetch)
                    prepare_send_buffer(gcn, data_parts);
                else {
                    GHOST_TRACE(fprintf(stderr, "%d: ghost_comm using prefetched data for operation %d, sending to %d\n", this_node, n, node));
#ifdef ADDITIONAL_CHECKS
                    if (n_s_buffer != calc_transmit_size(gcn, data_parts)) {
                        fprintf(stderr, "%d: ghost_comm transmission size and current size of cells to transmit do not match\n", this_node);
                        errexit();
                    }
#endif
                }
            }
            else {
                /* we do not send this time, let's look for a prefetch */
                if (prefetch) {
                    /* find next action where we send and which has PREFETCH set */
                    for (n2 = n+1; n2 < gc->num; n2++) {
                        GhostCommunication *gcn2 = &gc->comm[n2];
                        int comm_type2 = gcn2->type & GHOST_JOBMASK;
                        int prefetch2  = gcn2->type & GHOST_PREFETCH;
                        int node2      = gcn2->node;
                        if (is_send_op(comm_type2, node2) && prefetch2) {
                            GHOST_TRACE(fprintf(stderr, "%d: ghost_comm prefetch operation %d, is send/bcast to/from %d\n", this_node, n2, node2));
                            prepare_send_buffer(gcn2, data_parts);
                            break;
                        }
                    }
                }
            }

            /* recv buffer for recv and multinode operations to this node */
            if (is_recv_op(comm_type, node))
                prepare_recv_buffer(gcn, data_parts);

            /* transfer data */
            switch (comm_type) {
            case GHOST_RECV:
                GHOST_TRACE(fprintf(stderr, "%d: ghost_comm receive from %d (%d bytes)\n", this_node, node, n_r_buffer));
                MPI_Recv(r_buffer, n_r_buffer, MPI_BYTE, node, REQ_GHOST_SEND, MPI_COMM_WORLD, &status);
                break;
            case GHOST_SEND:
                GHOST_TRACE(fprintf(stderr, "%d: ghost_comm send to %d (%d bytes)\n", this_node, node, n_s_buffer));
                MPI_Send(s_buffer, n_s_buffer, MPI_BYTE, node, REQ_GHOST_SEND, MPI_COMM_WORLD);
                break;
            case GHOST_BCST:
                GHOST_TRACE(fprintf(stderr, "%d: ghost_comm bcast from %d (%d bytes)\n", this_node, node,
                                    (node == this_node) ? n_s_buffer : n_r_buffer));
                if (node == this_node)
                    MPI_Bcast(s_buffer, n_s_buffer, MPI_BYTE, node, MPI_COMM_WORLD);
                else
                    MPI_Bcast(r_buffer, n_r_buffer, MPI_BYTE, node, MPI_COMM_WORLD);
                break;
            case GHOST_RDCE:
                GHOST_TRACE(fprintf(stderr, "%d: ghost_comm reduce to %d (%d bytes)\n", this_node, node, n_s_buffer));
                if (node == this_node)
                    MPI_Reduce(s_buffer, r_buffer, n_s_buffer, MPI_BYTE, MPI_FORCES_SUM, node, MPI_COMM_WORLD);
                else
                    MPI_Reduce(s_buffer, NULL, n_s_buffer, MPI_BYTE, MPI_FORCES_SUM, node, MPI_COMM_WORLD);
                break;
            }
            GHOST_TRACE(MPI_Barrier(MPI_COMM_WORLD));
            GHOST_TRACE(fprintf(stderr, "%d: ghost_comm done\n", this_node));

            /* recv op; write back data directly, if no PSTSTORE delay is requested. */
            if (is_recv_op(comm_type, node)) {
                if (!poststore) {
                    /* forces have to be added, the rest overwritten. Exception is RDCE, where the addition
                       is integrated into the communication. */
                    if (data_parts == GHOSTTRANS_FORCE && comm_type != GHOST_RDCE)
                        add_forces_from_recv_buffer(gcn);
                    else
                        put_recv_buffer(gcn, data_parts);
                }
                else {
                    GHOST_TRACE(fprintf(stderr, "%d: ghost_comm delaying operation %d, recv from %d\n", this_node, n, node));
                }
            }
            else {
                /* send op; write back delayed data from last recv, when this was a prefetch send. */
                if (poststore) {
                    /* find previous action where we recv and which has PSTSTORE set */
                    for (n2 = n-1; n2 >= 0; n2--) {
                        GhostCommunication *gcn2 = &gc->comm[n2];
                        int comm_type2 = gcn2->type & GHOST_JOBMASK;
                        int poststore2 = gcn2->type & GHOST_PSTSTORE;
                        int node2      = gcn2->node;
                        if (is_recv_op(comm_type2, node2) && poststore2) {
                            GHOST_TRACE(fprintf(stderr, "%d: ghost_comm storing delayed recv, operation %d, from %d\n", this_node, n2, node2));
#ifdef ADDITIONAL_CHECKS
                            if (n_r_buffer != calc_transmit_size(gcn2, data_parts)) {
                                fprintf(stderr, "%d: ghost_comm transmission size and current size of cells to transmit do not match\n", this_node);
                                errexit();
                            }
#endif
                            /* as above */
                            if (data_parts == GHOSTTRANS_FORCE && comm_type != GHOST_RDCE)
                                add_forces_from_recv_buffer(gcn2);
                            else
                                put_recv_buffer(gcn2, data_parts);
                            break;
                        }
                    }
                }
            }
        }
    }
}
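The GHOST_RDCE branch reduces byte buffers with MPI_FORCES_SUM, which is presumably a user-defined reduction operation. As an illustration only (an assumption about what such an operation might look like; the real buffers may carry more than plain doubles), a commutative element-wise sum over doubles could be registered like this:

#include <mpi.h>

/* Hypothetical user-defined reduction: treat the MPI_BYTE buffers as arrays
   of doubles and add them element-wise. */
static void forces_sum_op(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
  double *a = (double *) in, *b = (double *) inout;
  int i, n = *len / (int) sizeof(double);   /* *len counts MPI_BYTE elements */
  for (i = 0; i < n; i++) b[i] += a[i];
  (void) dtype;
}

/* registration during setup (1 = commutative):
     MPI_Op forces_sum;
     MPI_Op_create(forces_sum_op, 1, &forces_sum);                           */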
int main(int argc, char* argv[]) {
#ifdef BENCHMARKING
    benchmark(argc, argv);
#else
    // mpi setup
    int numProcs;
    int rank, flag;
    int done = 0;
    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // create a buffer for both worker and controller
    static double buffer[BUFFER_SIZE];
    unsigned int niter = argc > 1 ? atoi(argv[1]) : NITER;

    // Setting up the PSF (statically)
    int psfWidth, psfHeight;
    double* psf = ImageQueue::getPsf(&psfWidth, &psfHeight);

    // ---------- CONTROLLER NODE ---------- //
    if (rank == 0) {
        // Set up producer
        ImageQueue images(buffer, BUFFER_SIZE, "../images", numProcs);

        // Print out some details
        int numImages = images.remaining();
        FPRINT("Starting %d iteration(s) on %d image(s)", niter, numImages);
        PerfTimer mainTimer;
        mainTimer.begin();

        int toSend = (unsigned int)numProcs < images.remaining() ? numProcs : images.remaining();
        for (int i = 0; i < toSend; i++) {
            images.pop(i);
            MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD);
        }

        while (images.remaining() > 0) {
            for (int i = 0; i < numProcs; i++) {

                // If an image is received then save it and send the next one
                MPI_Iprobe(i, IMG, MPI_COMM_WORLD, &flag, &status);
                if (flag) {
                    MPI_Recv(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD, &status);
                    images.save(i);
                    images.pop(i);
                    MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, i, IMG, MPI_COMM_WORLD);
                }
            }
        }

        for (int i = 0; i < numProcs; i++) {
            MPI_Send(&done, 1, MPI_INT, i, END, MPI_COMM_WORLD);
        }
        FPRINT("Finished %d image(s) in %f seconds", numImages, mainTimer.getElapsed());
    }

    // ---------- WORKER NODE ---------- //
    else { // worker thread
        // Set up consumer
        DeconvFilter filter(WIDTH, HEIGHT, niter, psf, psfWidth, psfHeight, buffer);
        bool running = true;
        PRINT("Worker thread initialised.");

        while (running) {
            MPI_Iprobe(0, IMG, MPI_COMM_WORLD, &flag, &status);
            if (flag) { // New image
                MPI_Recv(buffer, BUFFER_SIZE, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD, &status);
                filter.process();
                MPI_Send(buffer, BUFFER_SIZE, MPI_DOUBLE, 0, IMG, MPI_COMM_WORLD);
            }

            MPI_Iprobe(0, END, MPI_COMM_WORLD, &flag, &status);
            if (flag) { // Execution finished
                MPI_Recv(&done, 1, MPI_INT, 0, END, MPI_COMM_WORLD, &status);
                running = false;
            }
        }
        PRINT("Worker thread finished.");
    }

    MPI_Finalize();
#endif
    return 0;
}
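Both the controller and the workers poll with MPI_Iprobe and then receive a fixed-size BUFFER_SIZE message. When the message size is not known in advance, the same pattern is usually written with MPI_Probe plus MPI_Get_count; a generic sketch (function name and parameters are illustrative):

#include <mpi.h>
#include <stdlib.h>

/* Block until a message with the given tag arrives, size a buffer to fit it,
   and receive it. Caller frees the returned buffer. */
double *recv_sized(int source, int tag, MPI_Comm comm, int *count)
{
  MPI_Status status;
  double *buf;
  MPI_Probe(source, tag, comm, &status);
  MPI_Get_count(&status, MPI_DOUBLE, count);
  buf = (double *) malloc(*count * sizeof(double));
  MPI_Recv(buf, *count, MPI_DOUBLE, status.MPI_SOURCE, status.MPI_TAG,
           comm, MPI_STATUS_IGNORE);
  return buf;
}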
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  double a[ISIZE][JSIZE];

  int myRank;
  int numProcesses;

  MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
  MPI_Comm_size(MPI_COMM_WORLD, &numProcesses);

  if (myRank == 0) {
    // Initialization
    for (int i = 0; i < ISIZE; ++i) {
      for (int j = 0; j < JSIZE; ++j) {
        a[i][j] = 10 * i + j;
      }
    }
  }


  double timeElapsedMs;
  MEASURE_TIME_MS_BEGIN(timeElapsedMs);

  // Decide which process is responsible for which part of data
  int segmentsLength[numProcesses];
  int segmentsStart[numProcesses];
  int segmentsEnd[numProcesses];

  int base = JSIZE / numProcesses;
  int rem = JSIZE % numProcesses;
  for (int p = 0; p < numProcesses; ++p) {
    if (p < rem) {
      segmentsStart[p] = p * (base + 1);
      segmentsEnd[p] = segmentsStart[p] + base;
    } else {
      segmentsStart[p] = rem * (base + 1) + (p - rem) * base;
      segmentsEnd[p] = segmentsStart[p] + base - 1;
    }
    segmentsLength[p] = segmentsEnd[p] - segmentsStart[p] + 1;
  }

  double dataSplitted[JSIZE][ISIZE];
  if (myRank == 0) {
    for (int j0 = 0; j0 < JSIZE; ++j0) {
      for (int i = 0, j = j0; i < ISIZE; ++i, j = (j + JSIZE - 1) % JSIZE) {
        dataSplitted[j0][i] = a[i][j];
      }
    }
  }

  if (myRank == 0) {
    // Send split data to worker processes
    for (int p = 1; p < numProcesses; p++) {
      MPI_Send(dataSplitted[segmentsStart[p]], ISIZE * segmentsLength[p], MPI_DOUBLE, p, 
               TAG_SEND_DATA, MPI_COMM_WORLD);
    }
  } else {
    // Receive data from the master process
    MPI_Status status;
    MPI_Recv(dataSplitted[segmentsStart[myRank]], ISIZE * segmentsLength[myRank], MPI_DOUBLE,
             0, TAG_SEND_DATA, MPI_COMM_WORLD, &status);
  }

  // Compute partial results
  for (int j0 = segmentsStart[myRank]; j0 <= segmentsEnd[myRank]; ++j0) {
    for (int i = 0, j = j0; i < ISIZE; ++i, j = (j + JSIZE - 1) % JSIZE) {
      if ((j < JSIZE - 1) && (i > 0)) {
        for (int k = 0; k < KSIZE; ++k) {
          dataSplitted[j0][i] = sin(0.01 * dataSplitted[j0][i - 1]); 
        }
      }
    }
  }

  if (myRank > 0) {
    // Send data back to the master
    MPI_Send(dataSplitted[segmentsStart[myRank]], ISIZE * segmentsLength[myRank], MPI_DOUBLE,
             0, TAG_COLLECT_RESULTS, MPI_COMM_WORLD);
  } else {
    // Collect data from the processes
    for (int p = 1; p < numProcesses; ++p) {
      MPI_Status status;
      MPI_Recv(dataSplitted[segmentsStart[p]], ISIZE * segmentsLength[p], MPI_DOUBLE,
               p, TAG_COLLECT_RESULTS, MPI_COMM_WORLD, &status);
    }
  }

  if (myRank == 0) {
    // Put results back into matrix, output time and print results
  
    for (int j0 = 0; j0 < JSIZE; ++j0) {
      for (int i = 0, j = j0; i < ISIZE; ++i, j = (j + JSIZE - 1) % JSIZE) {
        a[i][j] = dataSplitted[j0][i]; 
      }
    }

   
    MEASURE_TIME_MS_END(timeElapsedMs);
    printf("%.6lf\n", timeElapsedMs);

    FILE *ff = fopen("parallel_2.out", "w");
    for (int i = 0; i < ISIZE; ++i) {
      for (int j = 0; j < JSIZE; ++j) {
        fprintf(ff, "%f ", a[i][j]);
      }
      fprintf(ff, "\n");
    }

    fclose(ff);
  }


  //printf("Process %d, segmentsBegin: %d, segmentsEnd: %d\n", myRank, segmentsStart[myRank], 
//         segmentsEnd[myRank]);

  MPI_Finalize();
  return 0;
}
Example #14
File: main.c Project: mhelal/mmDST
/* ============================================================================
	function MainProcess:
		seqNum: Number of sequences in string sequences (ex. 3)
		sequences: holds the sequences. (ex. [GTGCAACGTACT])
		seqLen: Array of the length of each sequence in sequences. (ex. 5,4,3)
		======= which means that we have three sequences GTGCA, ACGT, and ACT.
		stype: Scoring type (1: linear score, 2: PAM250 if protein, 3: BLOSUM if protein)
		partitionSize: Partition Size
============================================================================== */
void MainProcess (MOATypeDimn seqNum, char * * sequences, char * * seqName, MOATypeShape * seqLen, int stype, long partitionSize) {
    ProcessData * pData = NULL;
    ScoringData * sData = NULL;
    WavesData * wData = NULL;
    MOATypeDimn k;
    MOATypeInd i;
    int ret, startflag;
    struct rusage usageRec;
    double utime, stime;
    MPI_Status status;
    double t_start, t_finish;
    char command[2000];

#ifndef NDEBUG
    char msg[SHORT_MESSAGE_SIZE];
    int dbglevel = 0;
    MOATypeInd j;
#endif

    t_start = MPI_Wtime();
    /* print the input arguments ============================================*/
    //PrintSequencies (0, seqNum, sequences, seqLen);
#ifndef NDEBUG
    sprintf(msg, ">>>>MainProcess: Scoring Type: %d\n>>>>Partition Size: %ld\n", stype, partitionSize);	
    mprintf(dbglevel, msg, 1);
#endif

    /* Initialize Process Memory pData (function located in partitioning.c*/

    ret = initProcessMemory(&pData, &sData, &wData, seqNum, seqLen, sequences, seqName, stype, partitionSize);  
    if (ret != 0) {
        mprintf (0, ">>>>MainProcess: Error Initializing Process Data, Exiting\n", 1);
        fflush (stdout);
        return;
    }
    /* if restoring a previous run, read checkpoint data here; do not calculate waves */
    pData->OCout = NULL;
    pData->OCin = NULL;
    if (Mode != Distributed) {

        /* Construct MOA record */
        pData->msaAlgn =  NULL;
        createMOAStruct(&pData->msaAlgn);
        if (createMOA(seqLen, seqNum, pData->msaAlgn, 0, 0) < 0)
            return;		
        wData->wavesTotal = 1;
        wData->AllpartsInWave =  mmalloc((MOATypeInd) sizeof *wData->AllpartsInWave);
        wData->AllpartsInWave[0] = 1;
        pData->waveNo =  0;
        pData->partNo = 0;
        pData->partitionsCount = 1;
        /*pData->OCout = mmalloc((MOATypeInd) sizeof *(pData->OCout));
        if (pData->OCout == NULL) {
            mprintf(1, "Couldn't create memory for OCout while adding an OC. Exiting.\n", 3);
            printf("Couldn't create memory for OCout while . Exiting.\n",);
            return;
        }
        pData->OCout[0].wavesOC = 0;
        pData->OCout[0].WOCO = NULL;
        */
#ifndef NDEBUG
        sprintf (msg, "[%d]>ScoreCompThread[%ld]: Will call ComputePartitionScores\n", myProcid, pData->computedPartitions);
        mprintf (dbglevel, msg, 1);
#endif
            /* Compute Scored for Current Partition*/
        if (Algorithm == DP) 
            DPComputeScores (pData, sData, wData);
        else if (Algorithm == SP)
            DPComputeScores (pData, sData, wData);
        /* Print elements ======================================================= */
        //printMOA_scr(pData->msaAlgn, 0);
        /* Print Indexes ========================================================*/
        //printMOA_scr(pData->msaAlgn, 1);

        pData->computedPartitions ++;
        checkPoint (pData, sData);		
    }
    else {
         if (RestoreFlag == 1) {
            /* restore data */
            restoreCheckPoint (pData, wData);
            pData->globalWaveNo =  pData->waveNo;
        } else {
            pData->partNo = 0;
            pData->waveNo = 0;
            if (myProcid == 0) {
                printf ("[%d] Calculating waves and partitions .... ", myProcid);
                calcWaves (pData, wData);
                currNow = getTime();
                printf("[%d] Done Calculating waves and partitions time (%d, %d, %d, %d)\n", myProcid, currNow->tm_yday, currNow->tm_hour, currNow->tm_min, currNow->tm_sec);
                fflush(stdout);
                if( checkPointWavesCalculations (pData, wData) == 0) {
                    startflag = 1;
                    for (i=1; i<ClusterSize; i++)
                        MPI_Send(&startflag, 1, MPI_INT, i, 0, MOAMSA_COMM_WORLD);
                }
                else {
                    printf ("[%d]Couldn't write Waves calculations, Exiting\n", myProcid);
                    return;
                }
            }
            else {
                //printf ("[%d] waiting for start flag\n", myProcid);
                MPI_Recv(&startflag, 1, MPI_INT, 0, 0, MOAMSA_COMM_WORLD, &status);
                //printf ("[%d] received start flag = %d\n", myProcid, startflag);
                printf ("[%d] Reading waves and partitions .... ", myProcid);
                if (restoreWavesCalculations(pData, wData) != 0) {
                    printf ("[%d]Couldn't read Waves calculations, Exiting\n", myProcid);
                    return;
                }
                printf ("done.\n");
                fflush(stdout);
            }
        }	

#ifndef NDEBUG
        sprintf(msg, "[%d]>MainProcess: Current Wave: %ld - Current Partition: %ld - Total Partitions in Process: %ld\n", myProcid, pData->waveNo, pData->partNo, pData->partitionsCount);	
        mprintf(dbglevel, msg, 1);
#endif
        ScoreCompThread (pData, sData, wData);
    }
    if (myProcid == 0) {
        t_finish = MPI_Wtime();
        /* Getting Process Resources Usage ===================== */
        ret = getrusage(RUSAGE_SELF, &usageRec);
        if (ret == 0) {
            //printf ("[%d]Resources Usage: UTime %ld, STime %ld, Mem %ld, Virt %ld\n", myProcid, usageRec.ru_utime.tv_sec, usageRec.ru_stime.tv_sec, usageRec.ru_maxrss, usageRec.ru_ixrss);
            utime = (double) usageRec.ru_utime.tv_sec + 1.e-6 * (double) usageRec.ru_utime.tv_usec;
            stime = (double) usageRec.ru_stime.tv_sec + 1.e-6 * (double) usageRec.ru_stime.tv_usec;	
            //printf ("[%d]Resources Usage: UTime %f, STime %f\n", myProcid, utime, stime);
        }
        else
            printf ("[%d]Failed to retrieve Process Resources Usage, errno %d\n", myProcid, errno);

        //struct mallinfo info;
        //info = mallinfo();

        //printf("[%d] STime\tUTime\theap\tMemory\t\n",myProcid);
        //printf("[%d] %f\t%f\t%d\t%d\n", myProcid, stime, utime, info.arena, info.usmblks + info.uordblks);

        printf("STime\tUTime\n");
        printf("%f\t%f\n", stime, utime);
        printf ("Elsp-time: %f\n", t_finish - t_start);
        fflush(stdout);
        sprintf (command, "prstat 1 1 > /export/home/mhelal1/thesis/exp/run/prstatus/prst_%s", outputfilename);
        i = system (command);
    }
    /* Free allocated memory and exit routine ===================== */

    freeProcessMemory (&pData, &sData, &wData);
}
Example #15
int main (int argc, char **argv)
{


  MPI_Init (&argc, &argv);
    
  GetPot cl (argc, argv);

  if (cl.search (2, "-h", "--help"))
    {
      std::cerr << help_text << std::endl;
      return 0;
    }
  
  const double a = cl.follow (double (0.0), "-a");
  const double b = cl.follow (double (1.0), "-b");
  const unsigned int nnodes = cl.follow (100, 2, "-n", "--nnodes");
  const unsigned int nel = nnodes - 1;
  const std::string diffusion = cl.follow ("1.0", 2, "-d", "--diffusion");
  const std::string forcing = cl.follow ("1.0", 2, "-f", "--forcing");
  const double L = b - a;

  constexpr double tol = 1e-6;
  constexpr unsigned int maxit = 100;
  constexpr unsigned int overlap = 100;

  MPI_Status status;
  int mpi_size, mpi_rank, tag;
  MPI_Comm_size (MPI_COMM_WORLD, &mpi_size);
  MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank);
  
  const double L_loc = L / double(mpi_size);
  const double h = L_loc / ceil (double(nel) / double(mpi_size));

  double a_loc  = .0;
  double lval   = .0;
  double b_loc  = .0;
  double rval   = .0;
  double buffer = .0;
  
  unsigned int nel_loc = 0;
  unsigned int ndof_loc = 1;
  fem_1d<double> *subproblems;
  
  coeff<double> a_coeff (diffusion);
  coeff<double> f_coeff (forcing);


  a_loc = a + mpi_rank * L_loc;
  b_loc = a_loc + L_loc;
  nel_loc = ceil (double(nel) / double(mpi_size));
  if (mpi_rank > 0)
    {
      a_loc -= overlap * h;
      nel_loc += overlap;
    }
  if (mpi_rank < mpi_size - 1)
    {
      b_loc += overlap * h;
      nel_loc += overlap;
    }
  ndof_loc = nel_loc + 1;
  subproblems = new fem_1d<double>
    (new mesh<double> (a_loc, b_loc, ndof_loc));
      
  subproblems->set_diffusion_coefficient (a_coeff);
  subproblems->set_source_coefficient (f_coeff);
  subproblems->assemble ();
  subproblems->set_dirichlet (fem_1d<double>::left_boundary, 0.0);
  subproblems->set_dirichlet (fem_1d<double>::right_boundary, 0.0);
  subproblems->solve ();

  for (unsigned int it = 0; it < maxit; ++it)
    {

      // With the following implementation
      // communication occurs sequentially:
      // left to right first, then right to left
      
      // Receive from left neighbour
      if (mpi_rank > 0)
        {

          std::cerr << "rank " << mpi_rank
                    << " receiving lval from rank "
                    << mpi_rank - 1
                    << std::endl;

          MPI_Recv (&buffer, 1, MPI_DOUBLE, mpi_rank - 1, MPI_ANY_TAG,
                    MPI_COMM_WORLD, &status);

          std::cerr << "rank " << mpi_rank
                    << " received lval from rank "
                    << mpi_rank - 1
                    << std::endl;
            
          lval = buffer;
        }

      tag = 10*mpi_rank;
      // Send to right neighbour
      if (mpi_rank < mpi_size - 1)
        {
          buffer = subproblems->result ()
            [ndof_loc - 1 - 2*overlap];

          std::cerr << "rank " << mpi_rank
                    << " sending lval to rank "
                    << mpi_rank + 1
                    << std::endl;

          MPI_Send (&buffer, 1, MPI_DOUBLE, mpi_rank + 1, tag,
                    MPI_COMM_WORLD);

          std::cerr << "rank " << mpi_rank
                    << " sent lval to rank "
                    << mpi_rank + 1
                    << std::endl;
        }


      // Receive from right neighbour
      if (mpi_rank < mpi_size - 1)
        {
          std::cerr << "rank " << mpi_rank
                    << " receiving rval from rank "
                    << mpi_rank + 1
                    << std::endl;
          
          MPI_Recv (&buffer, 1, MPI_DOUBLE, mpi_rank + 1, MPI_ANY_TAG,
                    MPI_COMM_WORLD, &status);

          std::cerr << "rank " << mpi_rank
                    << " received rval from rank "
                    << mpi_rank + 1
                    << std::endl;

          rval = buffer;
        }

      tag = 10*mpi_rank + 1;
      // Send to left neighbour
      if (mpi_rank > 0)
        {
          buffer = subproblems->result () [2*overlap];

          std::cerr << "rank " << mpi_rank
                    << " sending rval to rank "
                    << mpi_rank - 1
                    << std::endl;

          MPI_Send (&buffer, 1, MPI_DOUBLE, mpi_rank - 1, tag,
                    MPI_COMM_WORLD);

          std::cerr << "rank " << mpi_rank
                    << " sent rval to rank "
                    << mpi_rank - 1
                    << std::endl;
        }


      subproblems->set_dirichlet
        (fem_1d<double>::left_boundary, lval);
      
      subproblems->set_dirichlet
        (fem_1d<double>::right_boundary, rval);
      
      subproblems->solve ();

    }

  for (int rank = 0; rank < mpi_size; ++rank)
    {
      if (rank == mpi_rank)
        for (unsigned int ii = 0; ii < ndof_loc; ++ii)
          std::cout << subproblems->m->nodes[ii] << " "
                    << subproblems->result ()(ii, 0)
                    << std::endl;
      MPI_Barrier (MPI_COMM_WORLD);
    }

  MPI_Finalize ();
  return 0;

}
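A side note on the exchange above: because each rank first waits for its left neighbour before sending to the right, the boundary values propagate one rank at a time. A common alternative is to pair each send with its matching receive in a single MPI_Sendrecv call and use MPI_PROC_NULL for the missing neighbours at the ends of the chain. The fragment below is only a sketch of that idea, reusing the variable names of the example above (mpi_rank, mpi_size, overlap, ndof_loc, lval, rval, subproblems); it is not part of the original program.

// Sketch (assumption, not original code): overlap exchange via MPI_Sendrecv.
// Sends/receives involving MPI_PROC_NULL complete immediately and leave the
// receive buffer untouched, so lval/rval keep their boundary value of 0.
const int left  = (mpi_rank > 0)            ? mpi_rank - 1 : MPI_PROC_NULL;
const int right = (mpi_rank < mpi_size - 1) ? mpi_rank + 1 : MPI_PROC_NULL;

double to_right = (right != MPI_PROC_NULL)
  ? subproblems->result ()[ndof_loc - 1 - 2*overlap] : 0.0;
double to_left  = (left  != MPI_PROC_NULL)
  ? subproblems->result ()[2*overlap] : 0.0;

// the value travelling right becomes the receiver's left boundary value ...
MPI_Sendrecv (&to_right, 1, MPI_DOUBLE, right, 0,
              &lval,     1, MPI_DOUBLE, left,  0,
              MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// ... and the value travelling left becomes the receiver's right boundary value
MPI_Sendrecv (&to_left,  1, MPI_DOUBLE, left,  1,
              &rval,     1, MPI_DOUBLE, right, 1,
              MPI_COMM_WORLD, MPI_STATUS_IGNORE);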
Example #16
0
int main(int argc, char *argv[])
{
  long N=20, M=30;      // number of cells NxM
  int n=2,  m=3;        // number of blocks nxm 
  int tpi=16, tpj=18;   // test pressure coordinates
  int tai=7, taj=9;     // test average coordinates
  int i, j, I, J;       // local and global i,j
  int myi, myj;         // my i,j in neighbor map
  int bi, bj;           // block size in y and x direction
  int numprocs, myid;   // number of processors and my rank id
  double **P, **A;      // 2D array of pressures and averages
  int **B;              // 2D array with map of neighbors

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);

  // get command line arguments if any
  if (argc > 1) {
    if (argc != 5) {
      if (myid==0) {
        fprintf(stderr, "usage: prog [N M n m]\n");
        fprintf(stderr, "Parameters:\n");
        fprintf(stderr, "\tN: number of rows or cells in y direction. Default: %ld\n", N);
        fprintf(stderr, "\tM: number of columns or cells in x direction. Default: %ld\n", M);
        fprintf(stderr, "\tn: number of blocks in y direction. Default: %d\n", n);
        fprintf(stderr, "\tm: number of blocks in x direction. Default %d\n", m);
      }
      MPI_Finalize();
      exit(3);
    } 
    N = atoi(argv[1]);
    M = atoi(argv[2]);
    n = atoi(argv[3]);
    m = atoi(argv[4]);
  }

  bi = N/n;
  bj = M/m;

  // start message
  if (myid==0) {
    printf("Terapressure v0.1\n");
    printf("=================\n");
    printf("Number of cells: %lu (%lu x %lu)\n", N*M, N, M);
    printf("Number of blocks: %d (%d x %d)\n", n*m, n, m);
    printf("Number of processors %d\n", numprocs);
    printf("Block size: (%d x %d)\n", bi, bj);
  }
  
  // validate parameters   
  if (N % n != 0 || M % m != 0) {
    if(myid==0) 
      fprintf(stderr,"Number of blocks in x or y axis do not fit.\n"); 
    MPI_Finalize();
    exit(1);
  }  
  if (numprocs != n*m) {
    if (myid==0) 
      fprintf(stderr,"Number of processors must be the same as number of blocks: %d\n", n*m);
    MPI_Finalize();
    exit(2);
  }

  double t = MPI_Wtime();

  // memory allocation
  // stack allocation is simple but limited in size
  // double   P[bi][bj];
  // double   A[bi][bj];
  // int     B[n][m];      
  
  // heap allocation
  P = malloc(sizeof(double*) * bi); 
  A = malloc(sizeof(double*) * bi); 
  for (i=0; i < bi; i++) {
    P[i] = malloc(sizeof(double) * bj);
    A[i] = malloc(sizeof(double) * bj);
  }  
  B = malloc(sizeof(int*) * n); 
  for (i=0; i < n; i++) {
    B[i] = malloc(sizeof(int) * m);
  }
  
  // domain decomposition
  int rank = 0;    
  //printf("Neighbors map:\n");
  for (i=0; i < n; i++) {
    for (j=0; j < m; j++) {
      if (rank == myid) {
        myi = i; 
        myj = j;
      }
      B[i][j] = rank++;
      //printf ("%3d ",  W[i][j]);
    }
    //printf ("\n");
  }
  //printf("%d: my i,j in neighbor map: %d,%d\n", myid, myi, myj);

  // compute pressures
  // printf("%d: My pressures:\n", myid);
  double pressure = -1;

  for (i=0; i < bi; i++) {
    I = myi * bi + i;
    for (j=0; j < bj; j++) {
      J = myj * bj + j;
      if (I==0 || I==N-1 || J==0 || J==M-1)
        P[i][j] = 0;
      else
        P[i][j] = (double)(I+J) * (double)(I*J);
      //printf ("L(%d,%d) G(%d,%d): %.2f\t",i,j,I,J, P[i][j]);
      if (I == tpi && J == tpj) 
        pressure = P[i][j];
    }
    //printf ("\n");
  }

  // average pressure
  int neighbor;
  double center, left, top, right, bottom;  
  double average = -1;

  for (i=0; i < bi; i++) {
    I = myi * bi + i;
    for (j=0; j < bj; j++) {
      J = myj * bj + j;
      if ( I==0 || I==N-1 || J==0 || J==M-1 )
        continue;
    
      center = P[i][j];
      
      // top cell
      if (i==0) {
        neighbor = B[myi-1][myj]; 
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&top, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, 0);      
        //printf("%2d: send to   %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I-1,J,top);
      } else {
        top = P[i-1][j];
      }
      // bottom cell
      if (i==bi-1) {
        neighbor = B[myi+1][myj];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&bottom, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, 0);
        //printf("%2d: send to   %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I+1,J,bottom);
      } else {
        bottom = P[i+1][j];
      }
      // left cell
      if (j==0) {
        neighbor = B[myi][myj-1];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&left, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, 0);
        //printf("%2d: send to   %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I,J-1,left);
      } else {
        left = P[i][j-1];
      }
      // right cell
      if (j==bj-1) {
        neighbor = B[myi][myj+1];
        MPI_Send(&center, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD);
        MPI_Recv(&right, 1, MPI_DOUBLE, neighbor, 0, MPI_COMM_WORLD, 0);
        //printf("%2d: send to   %d (%d,%d): %.2f\n", myid, neighbor,I,J,center);
        //printf("%2d: recv from %d (%d,%d): %.2f\n", myid, neighbor,I,J+1,right);
      } else {
        right = P[i][j+1];
      }
  
      A[i][j] = ( center + left + top + right + bottom ) / 5;    
      
      //printf ("L(%d,%d) G(%d,%d): %.2f\t",i,j,I,J, A[i][j]);       
      
      if (I==tai && J==taj) 
        average = A[i][j];
    }
    //printf ("\n");
  }

  // cleanup memory  
  for (i=0; i < bi; i++) {
    free(P[i]);
    free(A[i]);
  }
  free(P); 
  free(A);
  for (i=0; i < n; i++) {
    free(B[i]);
  }
  free(B);

  // report result
  //printf("Preasure at (16,18): %.2f\n", P[16][18]);
  //printf("Avg at (7,9): %.2f\n", A[7][9]);
 
  if (pressure > -1) 
    printf("Preasure at (%2d,%2d): %.2f computed by processor %d\n",
      tpi, tpj, pressure, myid);
  if (average > -1)
    printf("Average  at (%2d,%2d): %.2f computed by processor %d\n",
      tai, taj, average, myid);

  MPI_Barrier(MPI_COMM_WORLD); 
  if (myid==0)
    printf("Time elapsed: %.2f seconds.\n", MPI_Wtime()-t);
 
  MPI_Finalize();

  return 0;
}
/* *************************** *
 *  Main computational kernel  *
 * *************************** */
int correlationKernel(int rank,
                      int size,
                      double* dataMatrixX,
                      double* dataMatrixY,
                      int columns,
                      int rows,
                      char *out_filename,
                      int distance_flag) {

    int local_check = 0, global_check = 0;
    int i = 0, j, taskNo;
    int err, count = 0;
    unsigned long long fair_chunk = 0, coeff_count = 0;
    unsigned int init_and_cleanup_loop_iter=0;
    unsigned long long cor_cur_size = 0;
    
    double start_time, end_time;

    // Variables needed by the Indexed Datatype
    MPI_Datatype coeff_index_dt;
    MPI_File fh;
    int *blocklens, *indices;

    MPI_Status stat;
    MPI_Comm comm = MPI_COMM_WORLD;

    // Master processor keeps track of tasks
    if (rank == 0) {

        // Make sure everything will work fine even if there are
        // fewer genes than available workers (there are size-1 workers;
        // the master does not count)
        if ( (size-1) > rows )
            init_and_cleanup_loop_iter = rows+1;
        else
            init_and_cleanup_loop_iter = size;

        // Start timer
        start_time = MPI_Wtime();

        // Send out initial tasks (remember you have size-1 workers, master does not count)
        for (i=1; i<init_and_cleanup_loop_iter; i++) {
            taskNo = i-1;
            err = MPI_Send(&taskNo, 1, MPI_INT, i, 0, comm);
        }        

        // Terminate any processes that were left without work because
        // the number of rows was less than the number of available workers
        for(i=init_and_cleanup_loop_iter; i < size; i++) {
            PROF(rank, "\nPROF_idle : Worker %d terminated due to insufficient work load", i);
            err = -1;
            err = MPI_Send(&err, 1, MPI_INT, i, 0, comm);
        }

        // Wait for workers to finish their work assignment and ask for more
        for (i=init_and_cleanup_loop_iter-1; i<rows; i++) {
            err = MPI_Recv(&taskNo, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, &stat);

            // Check taskNo to make sure everything is ok. A negative value means there
            // is a problem, so terminate all remaining workers gracefully
            if ( taskNo < 0 ) {
                // Reduce by one because one worker is already terminated
                init_and_cleanup_loop_iter--;
                // Break and cleanup
                break;
            }

            // The sending processor is ready to work:
            // Its ID is in stat.MPI_SOURCE
            // Send it the current task (i)
            err = MPI_Send(&i, 1, MPI_INT, stat.MPI_SOURCE, 0, comm);
        }

        // Clean up processors
        for (i=1; i<init_and_cleanup_loop_iter; i++) {
            // All tasks complete - shutdown workers
            err = MPI_Recv(&taskNo, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, &stat);
            // If a process failed, it will not be waiting to receive anything;
            // we have to skip the send, otherwise it would deadlock
            if ( taskNo < 0 )
                continue;
            err = -1;
            err = MPI_Send(&err, 1, MPI_INT, stat.MPI_SOURCE, 0, comm);
        }

        // Master is *always* OK
        local_check = 0;
        MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        // Check failed, abort
        if ( global_check != 0 ) {
            return -1;
        }
        
        // Stop timer
        end_time = MPI_Wtime();
        PROF(rank, "\nPROF_comp (workers=%d) : Time taken by correlation coefficients computations : %g\n", size-1, end_time - start_time);

        // Start timer
        start_time = MPI_Wtime();

        // Master process must call MPI_File_set_view as well, it's a collective call
        // Open the file handler
        MPI_File_open(comm, out_filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

        // Create the file view
        MPI_File_set_view(fh, 0, MPI_DOUBLE, MPI_DOUBLE, "native", MPI_INFO_NULL);

        // Write data to disk
        MPI_File_write_all(fh, &cor[0], 0, MPI_DOUBLE, &stat);

        // Stop timer
        end_time = MPI_Wtime();
        PROF(rank, "\nPROF_write (workers=%d) : Time taken for global write-file : %g\n",  size-1, end_time - start_time);

    } else {

        // Compute how many workers will share the work load
        // Two scenarios exist:
        // (1) more OR equal number of workers and rows exist
        // (2) more rows than workers
        if ( (size-1) > rows ) {
            // In this scenario each worker gets exactly one work assignment.
            // There is no further work, so it only computes "rows"
            // coefficients
            fair_chunk = rows;
            cor_cur_size = fair_chunk;
        } else {
            // For this scenario we are going to allocate space equal to a fair
            // distribution of work assignments *plus* an extra amount of space to
            // cover any load imbalance. This amount is expressed as a percentage
            // of the fair work distribution (see MEM_PERC at the top of the file, 20% for now)

            // Plus 1 to round it up or just add some extra space, both are fine
            fair_chunk = (rows / (size-1)) + 1;
            DEBUG("fair_chunk %d \n", fair_chunk);

            // We can use "j" as temporary variable.
            // Plus 1 to avoid getting 0 from the multiplication.
            j = (fair_chunk * MEM_PERC) + 1;

            cor_cur_size = (fair_chunk + j) * rows;
            DEBUG("cor_cur_size %lld \n", cor_cur_size);
        }

        // Allocate memory
        DEBUG("cor_cur_size %lld \n", cor_cur_size);
        long long double_size = sizeof(double);
        DEBUG("malloc size %lld \n", (double_size * cor_cur_size));
        cor = (double *)malloc(double_size * cor_cur_size);

        blocklens = (int *)malloc(sizeof(int) * rows);
        indices = (int *)malloc(sizeof(int) * rows);

        mean_value_vectorX = (double *)malloc(sizeof(double) * rows);
        Sxx_vector = (double *)malloc(sizeof(double) * rows);
        mean_value_vectorY = (double *)malloc(sizeof(double) * rows);
        Syy_vector = (double *)malloc(sizeof(double) * rows);

        // Check that all memory is successfully allocated
        if ( ( cor == NULL ) || ( blocklens == NULL ) || ( indices == NULL ) || 
             ( mean_value_vectorX == NULL ) || ( Sxx_vector == NULL ) ||
             ( mean_value_vectorY == NULL ) || ( Syy_vector == NULL ) ) {
            ERR("**ERROR** : Memory allocation failed on worker process %d. Aborting.\n", rank);

            // Free allocated memory
            free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector);

            // Let the master process know it's aborting in order to terminate
            // the rest of the working workers
            // We have to receive a work assignment first and then terminate
            // otherwise the master will deadlock trying to give work to this worker
            err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat);
            taskNo = -1;
            err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm);

            // This worker failed
            local_check = 1;
            MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

            return -1;
        }

        // Compute necessary parameters for Pearson method
        // (this will transform the values of the input array to more meaningful data
        //  and save us from a lot of redundant computations)
        compute_parameters(dataMatrixX, dataMatrixY, rows, columns);

        // Main loop for workers. They get work from master, compute coefficients,
        // save them to their *local* vector and ask for more work
        for(;;) {
            // Get work
            err = 0;
            err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat);

            // If received task is -1, function is terminated
            if ( taskNo == -1 )  break;

            // Check if there is enough memory to store the new coefficients, if not reallocate
            // the current memory and expand it by MEM_PERC of the approximated size
            if ( cor_cur_size < (coeff_count + rows) ) {
                PROF(0, "\n**WARNING** : Worker process %3d run out of memory and reallocates. Potential work imbalancing\n", rank);
                DEBUG("\n**WARNING** : Worker process %3d run out of memory and reallocates. Potential work imbalancing\n", rank);

                // Use j as temporary again. Add two (or any other value) to avoid 0.
                // (two is just a random value, you can put any value really...)
                j = (fair_chunk * MEM_PERC) + 2;
                cor_cur_size += (j * rows);

                // Reallocate and check
                cor = (double *)realloc(cor, sizeof(double) * cor_cur_size);
                if ( cor == NULL ) {
                    ERR("**ERROR** : Memory re-allocation failed on worker process %d. Aborting.\n", rank);

                    // Let the master process know it's aborting in order to terminate
                    // the rest of the working workers
                    taskNo = -1;
                    err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm);

                    // This worker failed
                    local_check = 1;
                    MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

                    // Free all allocated memory
                    free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector);

                    return -1;
                }
            }

            // Compute the correlation coefficients
            if(dataMatrixY != NULL) {
              for (j=0; j < rows; j++) {
                cor[coeff_count] = pearson_XY(dataMatrixX, dataMatrixY, j, taskNo, columns);
                coeff_count++;
              }

            } else {
              for (j=0; j < rows; j++) {
                // Set main diagonal to 1
                if ( j == taskNo ) {
                  cor[coeff_count] = 1.0;
                  coeff_count++;
                  continue;
                }
                cor[coeff_count] = pearson(dataMatrixX, taskNo, j, columns);
                coeff_count++;
              }
            }

            // The value of blocklens[] represents the number of coefficients on each
            // row of the correlation array
            blocklens[count] = rows;

            // The value of indices[] represents the offset of each row in the data file
            indices[count] = (taskNo * rows);
            count++;

            // Give the master the taskID
            err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm);
        }

        // There are two possibilities
        //   (a) everything went well and all workers finished ok
        //   (b) some processes finished ok but one or more of the remaining working workers failed
        // To make sure all is well an all-reduce will be performed to sync all workers and guarantee success
        // before moving on to write the output file
        // This worker is OK
        local_check = 0;
        MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        // Check failed
        if ( global_check != 0 ) {
            // Free all allocated memory
          free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector);
            return -1;
        }

        PROF(0, "\nPROF_stats (thread %3d) : Fair chunk of work : %d \t\t Allocated : %d \t\t Computed : %d\n",
                rank, fair_chunk, cor_cur_size, coeff_count);

        // If the distance_flag is set, then transform all correlation coefficients to distances
        if ( distance_flag == 1 ) {
            for(j=0; j < coeff_count; j++) {
                cor[j] = 1 - cor[j];
            }
        }

        // Create and commit the Indexed datatype *ONLY* if there are data available
        if ( coeff_count != 0 ) {
            MPI_Type_indexed(count, blocklens, indices, MPI_DOUBLE, &coeff_index_dt);
            MPI_Type_commit(&coeff_index_dt);
        }

        // Open the file handler
        MPI_File_open(comm, out_filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);

        // Create the file view
        if ( coeff_count != 0 ) {
            MPI_File_set_view(fh, 0, MPI_DOUBLE, coeff_index_dt, "native", MPI_INFO_NULL);
        } else {
            MPI_File_set_view(fh, 0, MPI_DOUBLE, MPI_DOUBLE, "native", MPI_INFO_NULL);
        }

        // Write data to disk
        // TODO coeff_count cannot be greater than max int (for use in the MPI_File_write_all call). 
        // A better fix should be possible, for now throw error.
        
        DEBUG("\ncoeff_count is %lld\n", coeff_count);
        DEBUG("\INT_MAX is %d\n", INT_MAX);
        if(coeff_count>INT_MAX)
        {
            ERR("**ERROR** : Could not run as the chunks of data are too large. Try running again with more MPI processes.\n");

            // Free allocated memory
            free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector);

            // Let the master process know it's aborting in order to terminate
            // the rest of the working workers
            // We have to receive a work assignment first and then terminate
            // otherwise the master will deadlock trying to give work to this worker
            err = MPI_Recv(&taskNo, 1, MPI_INT, 0, 0, comm, &stat);
            taskNo = -1;
            err = MPI_Send(&taskNo, 1, MPI_INT, 0, 0, comm);

            // This worker failed
            local_check = 1;
            MPI_Allreduce(&local_check, &global_check, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

            return -1;
        }

        
        
        DEBUG("\nWriting %d to disk\n", coeff_count);

        MPI_File_write_all(fh, &cor[0], coeff_count, MPI_DOUBLE, &stat);

        if (coeff_count != 0 )
            MPI_Type_free(&coeff_index_dt);

        // Free all allocated memory
        free_all(cor, blocklens, indices, mean_value_vectorX, Sxx_vector, mean_value_vectorY, Syy_vector);
    }

         DEBUG("\nAbout to write to disk %d\n", rank);
    MPI_File_sync( fh ) ;   		// Causes all previous writes to be transferred to the storage device
         DEBUG("\nWritten to disk %d\n",rank);
  //  MPI_Barrier( MPI_COMM_WORLD ) ; 	// Blocks until all processes in the communicator have reached this routine.
         DEBUG("\nAfter barrier \n", rank);

    // Close file handler
    MPI_File_close(&fh);
  DEBUG("\nAfter file closed /n");
   // MPI_Barrier( MPI_COMM_WORLD ) ; 	// Blocks until all processes in the communicator have reached this routine.
      DEBUG("\nAbout to return from kernel /n");
      return 0;
}
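Regarding the INT_MAX limitation flagged in the TODO inside correlationKernel(): a workaround often used with MPI implementations that only accept int counts is to group the doubles into a larger derived datatype so that the count passed to MPI_File_write_all() stays below INT_MAX. The fragment below is only a sketch of that idea; CHUNK is a hypothetical block size, coeff_count is assumed to be an exact multiple of it, and every rank is assumed to take the same path so the collective call counts still match. It is not part of the original kernel.

/* Sketch (assumption, not original code): write coeff_count doubles with an
 * int-sized count by viewing the buffer as blocks of CHUNK doubles each.
 * A remainder that is not a multiple of CHUNK would need one extra write. */
#define CHUNK 1048576
MPI_Datatype chunk_dt;
MPI_Type_contiguous(CHUNK, MPI_DOUBLE, &chunk_dt);
MPI_Type_commit(&chunk_dt);
MPI_File_write_all(fh, cor, (int)(coeff_count / CHUNK), chunk_dt, &stat);
MPI_Type_free(&chunk_dt);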
Example #18
0
int main( int argc, char *argv[] )
{
	int numprocs, myid, server, workerid, ranks[1], 
		request, i, iter, ix, iy, done;
	long rands[CHUNKSIZE], max, in, out, totalin, totalout;
	double x, y, Pi, error, epsilon;
	MPI_Comm world, workers;
	MPI_Group world_group, worker_group;
	MPI_Status status;

	MPI_Init( &argc, &argv );
	world  = MPI_COMM_WORLD;
	MPI_Comm_size( world, &numprocs );
	MPI_Comm_rank( world, &myid );
	server = numprocs-1;	// Last process is a random server 

	/***
	 * Now Master should read epsilon from command line
	 * and distribute it to all processes.
	 */
	if (myid == 0)  // Read epsilon from command line 
		sscanf( argv[1], "%lf", &epsilon );
	MPI_Bcast( &epsilon, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD );

	/***
	 * Create new process group called world_group containing all 
	 * processes and its communicator called world
	 * and a group called worker_group containing all processes
	 * except the last one (called here server) 
	 * and its communicator called workers.
	 */
	MPI_Comm_group( world, &world_group );
	ranks[0] = server;
	MPI_Group_excl( world_group, 1, ranks, &worker_group );
	MPI_Comm_create( world, worker_group, &workers );
	MPI_Group_free( &worker_group );

	MPE_XGraph graph;
	MPE_Open_graphics(&graph,MPI_COMM_WORLD,(char*)0, -1,-1,WINDOW_SIZE,WINDOW_SIZE,MPE_GRAPH_INDEPENDENT);




	/***
	 * Server part
	 *
	 * Server should loop until request code is 0, in each iteration:
	 * - receiving request code from any slave
	 * - generating a vector of CHUNKSIZE randoms <= INT_MAX
	 * - sending vector back to slave 
	 */
	if (myid == server) {	// I am the random generator server

		do {
			MPI_Recv( &request, 1, MPI_INT, MPI_ANY_SOURCE, REQUEST,
					world, &status );
			if (request) {
				for (i = 0; i < CHUNKSIZE; ) {
					rands[i] = rand();
					if ( rands[i] <= INT_MAX ) i++;
				}
				MPI_Send( rands, CHUNKSIZE, MPI_LONG,
						status.MPI_SOURCE, REPLY, world );
			}
		}
		while( request > 0 );

	}
	/***
	 * Workers (including Master) part
	 *
	 * Worker should send initial request to server.
	 * Later, in a loop worker should:
	 * - receive vector of randoms
	 * - compute x,y point inside unit square
	 * - check (and count result) if point is inside/outside 
	 *   unit circle
	 * - sum both counts over all workers
	 * - calculate pi and its error (from "exact" value)
	 * - test if error is within epsilon limit
	 * - test continuation condition (error and max. points limit)
	 * - print pi by master only
	 * - send a request to the server (all workers if more work is needed, master only when finishing)
	 * Before finishing workers should free their communicator.
	 */ 
	else {			// I am a worker process

		request = 1;
		done = 0; 
		in = out = 0;
		max  = INT_MAX;         // max int, for normalization
		MPI_Send( &request, 1, MPI_INT, server, REQUEST, world );
		MPI_Comm_rank( workers, &workerid );
		iter = 0;
		while (!done) {
			iter++;
			request = 1;
			MPI_Recv( rands, CHUNKSIZE, MPI_LONG, server, REPLY,
					world, &status );
			for (i = 0; i < CHUNKSIZE - 1; )
			{
				x = (((double) rands[i++])/max) * 2 - 1;
				y = (((double) rands[i++])/max) * 2 - 1;
				if ( x*x + y*y < 1.0 )
				{
					MPE_Draw_point(graph,(int)(WINDOW_SIZE/2+x*WINDOW_SIZE/2),(int)(WINDOW_SIZE/2+y*WINDOW_SIZE/2),MPE_RED);
					in++;
				}
				else
					out++;
			}
			MPI_Allreduce( &in, &totalin, 1, MPI_LONG, MPI_SUM, workers );
			MPI_Allreduce( &out, &totalout, 1, MPI_LONG, MPI_SUM, workers );
			Pi = ( 4.0 * totalin ) / ( totalin + totalout );
			error = fabs( Pi - PI );
			done = ( error < epsilon || (totalin + totalout) > THROW_MAX );
			request = (done) ? 0 : 1;
		
			MPE_Update(graph);

			if (myid == 0)
			{
				printf( "\rpi = %23.20f", Pi );
				MPI_Send( &request, 1, MPI_INT, server, REQUEST, world );
			}
			else {
				if (request)
					MPI_Send( &request, 1, MPI_INT, server, REQUEST, world );
			}
		}
		MPI_Comm_free( &workers );
	}

	/***
	 * Master should print final point counts.
	 */
	if (myid == 0) {
		printf( "\npoints: %ld\nin: %ld, out: %ld, <ret> to exit\n",
				totalin+totalout, totalin, totalout );
		getchar();
	}

	MPE_Close_graphics(graph);
	MPI_Finalize();

	return 0;
}
Example #19
0
int main(int argc, char** argv)
{
  int my_rank, p;
  int i, dest;
  mpz_t currentPrime;
  unsigned long int product;
  sscanf(argv[1], "%lu", &product);
  int secondFactor = 0;
  int bcastStatus;
  int equals;

  /** GMP library variables **/
  mpz_t nextPrimeNumber;
  mpz_t testFactor;
  mpz_init(nextPrimeNumber);
  mpz_init_set_str (nextPrimeNumber, argv[1], 10);
  mpz_init(testFactor);
  mpz_init_set_ui(currentPrime, 2);
  mpz_nextprime(nextPrimeNumber, nextPrimeNumber);
  mpz_t testProduct;
  mpz_init(testProduct);

  /** MPI Initialization **/
  MPI_Request finalValue;
  MPI_File out;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &p);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Status status;

  /** Get Ready to receive a factor if another process finds one */
  MPI_Irecv(&secondFactor, 1, MPI_UNSIGNED_LONG, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &finalValue);
  
  /** Prepare initial offset for each process **/
  for (i=0 ; i < my_rank ; i++) {
    mpz_nextprime(currentPrime, currentPrime);
  }
  /** Start Timing **/
  double start = MPI_Wtime(), diff;
  while (!secondFactor) {
    /** Check if another process has found the factors **/
    MPI_Test (&finalValue, &bcastStatus, &status);
    if(bcastStatus) {
      /** Somebody else has found the factors, we are done **/
      MPI_Wait(&finalValue, &status);
      break;
    }
      /** Skip P primes before checking again **/
    for (i=0 ; i < p ; i++) {
      mpz_nextprime(currentPrime, currentPrime);
    }
    
    /** Brute force check if the current working prime is a factor of the input number **/
    for (mpz_set_ui(testFactor , 2) ; mpz_get_ui(testFactor) <= mpz_get_ui(currentPrime); mpz_nextprime(testFactor, testFactor)) {
      /** Check if another process has found the factors **/
      MPI_Test (&finalValue, &bcastStatus, &status);
      if(bcastStatus) {
        MPI_Wait(&finalValue, &status);
        break;
      }
      mpz_mul_ui(testProduct, currentPrime, mpz_get_ui(testFactor));
      equals = mpz_cmp_ui(testProduct, product);
      if (equals == 0){
        /** We've found the factor; record the second number and send it to the other processes **/
        secondFactor = mpz_get_ui(testFactor);
        printf("done by process %d, factors are %lu and %d \n", my_rank, mpz_get_ui(currentPrime), secondFactor);
        fflush(stdout);
        for (dest = 0 ; dest < p ; dest++) {
          if (dest != my_rank) {
            MPI_Send(&secondFactor, 1, MPI_UNSIGNED_LONG, dest, 0, MPI_COMM_WORLD);
          }
        }
      }
    }
  }

  diff = MPI_Wtime() - start;
  /** End Timing **/

  /** Prepare file contents **/
  char fileName[200], fileContents[200];
  sprintf(fileName, "time_%lu", product);
  sprintf(fileContents, "%d\t%f\n", my_rank, diff);

  /** Write File **/
  MPI_File_open( MPI_COMM_WORLD, fileName, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &out );
  MPI_File_seek(out, my_rank*strlen ( fileContents ) , MPI_SEEK_SET);
  MPI_File_write_all(out , &fileContents, strlen ( fileContents ), MPI_CHAR, &status );
  MPI_File_close(&out);

  /** Fin **/
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return(0);
}
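One loose end in the example above: on the rank that finds the factors, the MPI_Irecv posted at the start is never matched, so its request is still pending when MPI_Finalize is called. A defensive variant, shown only as a sketch and not part of the original, would cancel and complete the outstanding request before finalizing (the MPI standard permits cancelling receive requests):

/* Sketch (assumption, not original code): complete or cancel the
 * early-termination receive before MPI_Finalize. On ranks where the request
 * already completed it is MPI_REQUEST_NULL, so MPI_Test simply reports done. */
MPI_Test(&finalValue, &bcastStatus, &status);
if (!bcastStatus) {
    MPI_Cancel(&finalValue);
    MPI_Wait(&finalValue, &status);   /* the cancelled request completes here */
}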
Example #20
0
static void
_coupling_mpi_set_synchronize_roots(ple_coupling_mpi_set_t  *s,
                                    int                      sync_flag,
                                    double                   time_step,
                                    _mpi_double_int_t        glob_vals[])
{
  int i;
  MPI_Status status;
  int app_rank;

  int sync_root = -1;

  MPI_Comm_rank(s->app_comm, &app_rank);

  /* Return immediately if not application root */

  if (app_rank != 0 || (s->app_status[s->app_id] & PLE_COUPLING_NO_SYNC))
    return;

  /* First, sync data to root */

  for (i = 0; i < s->n_apps; i++) {
    if (! (s->app_status[i] & PLE_COUPLING_NO_SYNC)) {
      sync_root = i;
      break;
    }
  }

  if (sync_root == s->app_id) {

    for (i = 0; i < s->n_apps; i++) {
      if (s->app_status[i] & PLE_COUPLING_NO_SYNC) { /* Keep previous values */
        glob_vals[i].i = s->app_status[i];
        glob_vals[i].d = s->app_timestep[i];
      }
      else {
        if (i != sync_root)
          MPI_Recv(&(glob_vals[i]), 1, MPI_DOUBLE_INT, s->app_info[i*4],
                   _coupling_tag, s->base_comm, &status);
        else {
          glob_vals[i].i = sync_flag;
          glob_vals[i].d = time_step;
        }
      }
    }
  }
  else if (! (s->app_status[s->app_id] & PLE_COUPLING_NO_SYNC)) {
    _mpi_double_int_t send_vals;
    send_vals.i = sync_flag;
    send_vals.d = time_step;
    MPI_Send(&send_vals, 1, MPI_DOUBLE_INT, s->app_info[sync_root],
             _coupling_tag, s->base_comm);
  }

  /* Now, root sends data to all */

  if (sync_root == s->app_id) {
    for (i = 0; i < s->n_apps; i++) {
      if (i != sync_root && ! (s->app_status[i] & PLE_COUPLING_NO_SYNC))
        MPI_Send(glob_vals, s->n_apps, MPI_DOUBLE_INT, s->app_info[i*4],
                 _coupling_tag, s->base_comm);
    }
  }
  else
    MPI_Recv(glob_vals, s->n_apps, MPI_DOUBLE_INT, s->app_info[sync_root],
             _coupling_tag, s->base_comm, &status);
}
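For context, the _mpi_double_int_t values exchanged above must match the layout of MPI_DOUBLE_INT, which the MPI standard defines as a double followed by an int. The actual typedef lives elsewhere in the PLE sources; the sketch below only illustrates the layout implied by the .d/.i accesses in the function above and is not taken from the library.

/* Sketch (assumption, not the library's definition): a pair matching
 * MPI_DOUBLE_INT, i.e. a double followed by an int. */
typedef struct {
  double d;   /* time step value      */
  int    i;   /* synchronization flag */
} _mpi_double_int_t;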
Example #21
0
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES     *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc    = agmres->Qloc;
  PetscScalar    *sgn     = agmres->sgn;
  PetscScalar    *tloc    = agmres->tloc;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First, Last = agmres->Last;
  PetscMPIInt    Iright   = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar    *y, *zloc;
  PetscErrorCode ierr;
  PetscInt       nloc,d, len, i, j;
  PetscBLASInt   bnvec,pas,blen;
  PetscInt       dpt;
  PetscReal      c, s, rho, zp, zq, yd, tt;
  MPI_Status     status;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(nvec,&bnvec);CHKERRQ(ierr);
  ierr = PetscObjectGetComm((PetscObject)ksp,&comm);CHKERRQ(ierr);
  pas  = 1;
  ierr = VecGetLocalSize(VEC_V(0), &nloc);CHKERRQ(ierr);
  ierr = PetscMalloc1(nvec, &y);CHKERRQ(ierr);
  ierr = PetscMemcpy(y, In, nvec*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = VecGetArray(Out, &zloc);CHKERRQ(ierr);

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        ierr = MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);CHKERRQ(ierr);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i         = j - d;
          ierr      = KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);CHKERRQ(ierr);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] =     c * zp + s * zq;
          zloc[i]   =     -s * zp + c * zq;
        }
        ierr = KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);CHKERRQ(ierr);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    =      c * zp + s * zq;
          zloc[0] =   -s * zp + c * zq;
          ierr    = MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);CHKERRQ(ierr);
        } else {
          ierr    = MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);CHKERRQ(ierr);
          zp      = yd;
          zq      = zloc[0];
          yd      =      c * zp + s * zq;
          zloc[0] =   -s * zp + c * zq;
          ierr    = MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);CHKERRQ(ierr);
        }
      }
    }
  }
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len       = nloc - j;
      ierr      = PetscBLASIntCast(len,&blen);CHKERRQ(ierr); 
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  ierr = VecRestoreArray(Out, &zloc);CHKERRQ(ierr);
  ierr = PetscFree(y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
Example #22
0
void Foam::mpiPstreamImpl::reduce(scalar& Value, const sumOp<scalar>& bop)
{
    if (!Pstream::parRun())
    {
        return;
    }

    if (Pstream::nProcs() <= Pstream::nProcsSimpleSum)
    {
        if (Pstream::master())
        {
            for
            (
                int slave=Pstream::firstSlave();
                slave<=Pstream::lastSlave();
                slave++
            )
            {
                scalar value;

                if
                (
                    MPI_Recv
                    (
                        &value,
                        1,
                        MPI_SCALAR,
                        Pstream::procID(slave),
                        Pstream::msgType(),
                        MPI_COMM_WORLD,
                        MPI_STATUS_IGNORE
                    )
                )
                {
                    FatalErrorIn
                    (
                        "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                    )   << "MPI_Recv failed"
                        << Foam::abort(FatalError);
                }

                Value = bop(Value, value);
            }
        }
        else
        {
            if
            (
                MPI_Send
                (
                    &Value,
                    1,
                    MPI_SCALAR,
                    Pstream::procID(Pstream::masterNo()),
                    Pstream::msgType(),
                    MPI_COMM_WORLD
                )
            )
            {
                FatalErrorIn
                (
                    "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                )   << "MPI_Send failed"
                    << Foam::abort(FatalError);
            }
        }


        if (Pstream::master())
        {
            for
            (
                int slave=Pstream::firstSlave();
                slave<=Pstream::lastSlave();
                slave++
            )
            {
                if
                (
                    MPI_Send
                    (
                        &Value,
                        1,
                        MPI_SCALAR,
                        Pstream::procID(slave),
                        Pstream::msgType(),
                        MPI_COMM_WORLD
                    )
                )
                {
                    FatalErrorIn
                    (
                        "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                    )   << "MPI_Send failed"
                        << Foam::abort(FatalError);
                }
            }
        }
        else
        {
            if
            (
                MPI_Recv
                (
                    &Value,
                    1,
                    MPI_SCALAR,
                    Pstream::procID(Pstream::masterNo()),
                    Pstream::msgType(),
                    MPI_COMM_WORLD,
                    MPI_STATUS_IGNORE
                )
            )
            {
                FatalErrorIn
                (
                    "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                )   << "MPI_Recv failed"
                    << Foam::abort(FatalError);
            }
        }
    }
    else
    {
        scalar sum;
        MPI_Allreduce(&Value, &sum, 1, MPI_SCALAR, MPI_SUM, MPI_COMM_WORLD);
        Value = sum;

        /*
        int myProcNo = Pstream::myProcNo();
        int nProcs = Pstream::nProcs();

        //
        // receive from children
        //
        int level = 1;
        int thisLevelOffset = 2;
        int childLevelOffset = thisLevelOffset/2;
        int childProcId = 0;

        while
        (
            (childLevelOffset < nProcs)
         && (myProcNo % thisLevelOffset) == 0
        )
        {
            childProcId = myProcNo + childLevelOffset;

            scalar value;

            if (childProcId < nProcs)
            {
                if
                (
                    MPI_Recv
                    (
                        &value,
                        1,
                        MPI_SCALAR,
                        Pstream::procID(childProcId),
                        Pstream::msgType(),
                        MPI_COMM_WORLD,
                        MPI_STATUS_IGNORE
                    )
                )
                {
                    FatalErrorIn
                    (
                        "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                    )   << "MPI_Recv failed"
                        << Foam::abort(FatalError);
                }

	            Value = bop(Value, value);
	        }

            level++;
            thisLevelOffset <<= 1;
            childLevelOffset = thisLevelOffset/2;
        }

        //
        // send and receive from parent
        //
        if (!Pstream::master())
        {
            int parentId = myProcNo - (myProcNo % thisLevelOffset);

            if
            (
                MPI_Send
                (
                    &Value,
                    1,
                    MPI_SCALAR,
                    Pstream::procID(parentId),
                    Pstream::msgType(),
                    MPI_COMM_WORLD
                )
            )
            {
                FatalErrorIn
                (
                    "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                )   << "MPI_Send failed"
                    << Foam::abort(FatalError);
            }

            if
            (
                MPI_Recv
                (
                    &Value,
                    1,
                    MPI_SCALAR,
                    Pstream::procID(parentId),
                    Pstream::msgType(),
                    MPI_COMM_WORLD,
                    MPI_STATUS_IGNORE
                )
            )
            {
                FatalErrorIn
                (
                    "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                )   << "MPI_Recv failed"
                    << Foam::abort(FatalError);
            }
        }


        //
        // distribute to my children
        //
        level--;
        thisLevelOffset >>= 1;
        childLevelOffset = thisLevelOffset/2;

        while (level > 0)
        {
            childProcId = myProcNo + childLevelOffset;

            if (childProcId < nProcs)
            {
                if
                (
                    MPI_Send
                    (
                        &Value,
                        1,
                        MPI_SCALAR,
                        Pstream::procID(childProcId),
                        Pstream::msgType(),
                        MPI_COMM_WORLD
                    )
                )
                {
                    FatalErrorIn
                    (
                        "reduce(scalar& Value, const sumOp<scalar>& sumOp)"
                    )   << "MPI_Send failed"
                        << Foam::abort(FatalError);
                }
            }

            level--;
            thisLevelOffset >>= 1;
            childLevelOffset = thisLevelOffset/2;
        }
        */
    }
}
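The explicit gather-to-master and redistribute loops in the small-communicator branch above (and the tree variant kept in the comment block) implement the same semantics as a reduce followed by a broadcast. A minimal sketch of that collective equivalent, expressed with plain MPI calls and the MPI_SCALAR / Pstream helpers already used above, would be:

// Sketch (assumption, not original code): sum on the master, then hand the
// result back to every process, using collectives instead of explicit loops.
const int masterRank = Pstream::procID(Pstream::masterNo());
scalar sum = 0;
MPI_Reduce(&Value, &sum, 1, MPI_SCALAR, MPI_SUM, masterRank, MPI_COMM_WORLD);
if (Pstream::master())
{
    Value = sum;
}
MPI_Bcast(&Value, 1, MPI_SCALAR, masterRank, MPI_COMM_WORLD);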
Example #23
0
void terminate_children (int nproc)
{
    // MPI_Abort(MPI_COMM_WORLD, 0);
    for (int i = 1; i < nproc; i++)
        MPI_Send(NULL, 0, MPI_DOUBLE, i, exit_tag, MPI_COMM_WORLD);
}
Example #24
0
int main(int argc, char *argv[] ) {
	double time1, time2;
	
	time1 = MPI_Wtime();

	int rank, processors;
	
	int j;	// number of iterations
	int k;	// number of iterations to perform before creating a checkpoint
	int l;  // number of random samples per grid point
	int checkpoint_resume = 0;	// 1 = resume from last checkpoint

	int c;		// used to hold a character
	int i=0, row = 0, col = 0, pln = 0;	// array iterators

	char ***local_array;		   
	char **local_array_2nd;		   
	char *local_array_pointer; 

	char ***local_array_copy;		   
	char **local_array_copy_2nd;		   
	char *local_array_copy_pointer; 

	char ***temp, *temp_pointer;
	
	int file_open_error;
	int command_line_incomplete = 0;

	int grid_size[3] 	  = {0,0,0};
	int proc_size[3] 	  = {0,0,0};
	int local_size[3] 	  = {0,0,0};
	int remainder_size[3] = {0,0,0};
	int coords[3] 		  = {0,0,0};
	int start_indices[3]  = {0,0,0};
	int periods[3]        = {0,0,0};
	int mem_size[3]       = {0,0,0};
	
	MPI_Status status;
	MPI_Datatype filetype, memtype;
	MPI_File fh;
	
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &processors);	
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);	

	// Interpret the command line arguments --------------------------------
  	if (rank == 0) {  	  	
		
		if (argc < 7 || argc > 8) {
			fputs("usage: x y z j k l r\n", stderr);
			fputs("where: x,y,z = x, y and z dimensions\n", stderr);
			fputs("       j = how many times the game of life is played\n", stderr);
			fputs("       k = checkpoint every k iterations\n", stderr);
			fputs("       l = number of random samples per grid point\n", stderr);
			fputs("       r = resume from the last checkpoint\n", stderr);
			fputs(INITIAL, stderr);
			fputs(" must be present.\n", stderr);
			fputs(CHECKPOINT, stderr);
			fputs(" must be present if resuming from the last checkpoint.\n", stderr);
			exit(EXIT_FAILURE);
		}

  	}

	j = (int) strtol(argv[4], NULL, 10);
	k = (int) strtol(argv[5], NULL, 10);
	l = (int) strtol(argv[6], NULL, 10);		
	if ( argc == 8 )
		if ( argv[7][0] == 'r' )
			checkpoint_resume = 1;

	if (rank == 0)
		printf("%d iterations \ncheckpoint every %d iterations \n%d samples per grid point \ncheckpoint resume = %d\n", j,k,l,checkpoint_resume);				
	
	grid_size[0] = (int) strtol(argv[1], NULL, 10);
	grid_size[1] = (int) strtol(argv[2], NULL, 10);
	grid_size[2] = (int) strtol(argv[3], NULL, 10);
	if (rank==0) printf("grid_size: %d, %d, %d\n", grid_size[0], grid_size[1], grid_size[2]);

	MPI_Dims_create(processors, 3, proc_size);
	if (rank==0) printf("proc_size: %d, %d, %d\n", proc_size[0], proc_size[1], proc_size[2]);

	local_size[0] = grid_size[0] / proc_size[0];
	local_size[1] = grid_size[1] / proc_size[1];
	local_size[2] = grid_size[2] / proc_size[2];
	if (rank==0) printf("local_size: %d, %d, %d\n", local_size[0], local_size[1], local_size[2]);

	remainder_size[0] = grid_size[0] % proc_size[0];
	remainder_size[1] = grid_size[1] % proc_size[1];
	remainder_size[2] = grid_size[2] % proc_size[2];
	if (rank==0) printf("remainder_size: %d, %d, %d\n", remainder_size[0], remainder_size[1], remainder_size[2]);
	if (remainder_size[0] != 0 || remainder_size[1] != 0 || remainder_size[2] != 0) {
		fputs("remainder size != 0, check your dimensions", stderr);
		MPI_Finalize();
		exit(EXIT_FAILURE);
	}

	MPI_Comm comm;
	MPI_Cart_create(MPI_COMM_WORLD, 3, proc_size, periods, 0, &comm);
	MPI_Comm_rank(comm, &rank);
	MPI_Cart_coords(comm, rank, 3, coords);

	start_indices[0] = coords[0] * local_size[0];
	start_indices[1] = coords[1] * local_size[1];
	start_indices[2] = coords[2] * local_size[2];

/*	printf("A coords R%d: (%d, %d, %d)  (%d, %d, %d)\n", rank, coords[0], coords[1], coords[2], start_indices[0], start_indices[1], start_indices[2]);*/
	fflush(stdout);
	
	// create the file type ---------------------------------------------------
	MPI_Type_create_subarray(3, grid_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &filetype); 
	MPI_Type_commit(&filetype);
	
	// create a local memory type with ghost rows -----------------------------
	mem_size[0] = local_size[0] + 2; 
	mem_size[1] = local_size[1] + 2; 
	mem_size[2] = local_size[2] + 2; 
	start_indices[0] = start_indices[1] = start_indices[2] = 1;
	
	MPI_Type_create_subarray(3, mem_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &memtype);
	MPI_Type_commit(&memtype);
	
	// find my neighbors ------------------------------------------------------

	int nxminus, nxplus, nyminus, nyplus, nzminus, nzplus, tag = 333, *neighbors;

	// Neighbors Array:  row-  col-  col+  row+  plane-  plane+

	neighbors = (int *) malloc(6 * sizeof(int));
	for(i=0; i<6; i++)
		neighbors[i] = rank;

	MPI_Cart_shift(comm, 0, 1, &nxminus, &nxplus);
	MPI_Cart_shift(comm, 1, 1, &nyminus, &nyplus);
	MPI_Cart_shift(comm, 2, 1, &nzminus, &nzplus);

//	printf(" %d sending south to %d receiving from %d \n",rank,nxplus,nxminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nxplus, tag, 
		&(neighbors[0]), 1, MPI_INT, nxminus, tag, comm, &status);

//	printf(" %d sending North to %d receiving from %d \n",rank,nxminus,nxplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nxminus, tag, 
		&(neighbors[3]), 1, MPI_INT, nxplus, tag, comm, &status);

//	printf(" %d sending East to %d receiving from %d \n",rank,nyplus,nyminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nyplus, tag, 
		&neighbors[1], 1, MPI_INT, nyminus, tag, comm, &status);

//	printf(" %d sending West to %d receiving from %d \n",rank,nyminus,nyplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nyminus, tag, 
		&neighbors[2], 1, MPI_INT, nyplus, tag, comm, &status);

//	printf(" %d sending backwards to %d receiving from %d \n",rank,nzplus,nzminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nzplus, tag, 
		&(neighbors[4]), 1, MPI_INT, nzminus, tag, comm, &status);

//	printf(" %d sending forward to %d receiving from %d \n",rank,nzminus,nzplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nzminus, tag, 
		&(neighbors[5]), 1, MPI_INT, nzplus, tag, comm, &status);

/*	printf("neighboors R%d : (row-) %d (col-) %d (col+) %d (row+) %d (plane-) %d (plane+) %d\n",rank,neighbors[0],neighbors[1],neighbors[2],neighbors[3],neighbors[4],neighbors[5]);*/
	fflush(stdout);	

	//init_sprng(1,time(0),SPRNG_DEFAULT);
	srand((unsigned int)time(NULL));
		
	// Open the initial condition (checkpoint or not) ----------------------

	if ( checkpoint_resume ) {
		file_open_error = 
		MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
		MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	}
	else {
		file_open_error = 
		MPI_File_open(MPI_COMM_WORLD, INITIAL, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
		MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	}
	if (file_open_error != MPI_SUCCESS) {
		if (checkpoint_resume)
			fputs(CHECKPOINT, stderr);
		else
			fputs(INITIAL, stderr);
		fputs(" could not be opened.\n", stderr);
		exit(EXIT_FAILURE);
	}

	// Allocate and Populate the local array ----------------------------------
	
	local_array_copy_pointer = (char *)   malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char));
	local_array_copy_2nd     = (char **)  malloc(mem_size[0] * mem_size[1] * sizeof(char*));
	local_array_copy         = (char ***) malloc(mem_size[0] * sizeof(char*));
	for(i = 0; i < mem_size[0] * mem_size[1]; i++)
		local_array_copy_2nd[i] = &local_array_copy_pointer[i * mem_size[2]];
	for(i = 0; i < mem_size[0]; i++)
		local_array_copy[i] = &local_array_copy_2nd[i * mem_size[1]];

	local_array_pointer = (char *)   malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char));
	local_array_2nd  	= (char **)  malloc(mem_size[0] * mem_size[1] * sizeof(char*));
	local_array			= (char ***) malloc(mem_size[0] * sizeof(char*));
	for(i = 0; i < mem_size[0] * mem_size[1]; i++)
		local_array_2nd[i] = &local_array_pointer[i * mem_size[2]];
	for(i = 0; i < mem_size[0]; i++)
		local_array[i] = &local_array_2nd[i * mem_size[1]];
	
	// if (rank==0) printf("Malloc complete\n");
	
	for(row=0; row<mem_size[0]; row++) {
		for(col=0; col<mem_size[1]; col++) {
			for(pln=0; pln<mem_size[2]; pln++) {
				local_array[row][col][pln] = local_array_copy[row][col][pln] = '0';
			}
		}
	}
	
	// if (rank==0) printf("Setup complete\n");

	MPI_File_read_all(fh, local_array_pointer, 1, memtype, &status);

	if (rank==0) printf("File Read\n");
	
//	if (rank==0) {
//	for(row=0; row<mem_size[0]; row++) {
//		for(col=0; col<mem_size[1]; col++) {
//			for(pln=0; pln<mem_size[2]; pln++) {
//				printf("%c", local_array[row][col][pln]);
//			}
//			printf("\n");
//		}
//		printf("-----------------------\n");
//	}
//	}
	
	MPI_File_close(&fh);
		
	// Construct the plane data types
	
	MPI_Datatype yzplane;
	MPI_Type_vector(local_size[1], local_size[2], local_size[2]+2, MPI_CHAR, &yzplane);
	MPI_Type_commit(&yzplane);

	MPI_Datatype xzplane;
	MPI_Type_vector(local_size[0], local_size[2], ((local_size[2]+2)*local_size[1])+((local_size[2]+2)*2), MPI_CHAR, &xzplane);
	MPI_Type_commit(&xzplane);

	// this type will also copy the corner x columns, can't skip blocks intermittently
	// since we aren't worrying about the corner data, it's ok
	MPI_Datatype xyplane; 
	MPI_Type_vector((local_size[0]*local_size[1])+((local_size[0]*2)-2), 1, local_size[2]+2, MPI_CHAR, &xyplane);
	MPI_Type_commit(&xyplane);
					
	MPI_Barrier(comm);
	
	// start the iteration loop
	
	int iterations;
	int kCounter = k;
	for (iterations = 0; iterations < j; iterations++) {

		// send updated planes
		// Neighbors Array:  
		// 0     1     2     3     4       5
		// row-  col-  col+  row+  plane-  plane+
		// Note: corners are not handled
		
		// send top yzplane
		if (rank != neighbors[0]) MPI_Send(&local_array[1][1][1], 1, yzplane, neighbors[0], 0, comm);
		// recv bottom yzplane
		if (rank != neighbors[3]) MPI_Recv(&local_array[local_size[0]+1][1][1], 1, yzplane, neighbors[3], 0, comm, &status);

		// send bottom yzplane
		if (rank != neighbors[3]) MPI_Send(&local_array[local_size[0]][1][1], 1, yzplane, neighbors[3], 0, comm);
		// recv top yzplane
		if (rank != neighbors[0]) MPI_Recv(&local_array[0][1][1], 1, yzplane, neighbors[0], 0, comm, &status);

		// send left xzplane
		if (rank != neighbors[1]) MPI_Send(&local_array[1][1][1], 1, xzplane, neighbors[1], 0, comm);
		// recv right xzplane
		if (rank != neighbors[2]) MPI_Recv(&local_array[1][local_size[1]+1][1], 1, xzplane, neighbors[2], 0, comm, &status);

		// send right xzplane
		if (rank != neighbors[2]) MPI_Send(&local_array[1][local_size[1]][1], 1, xzplane, neighbors[2], 0, comm);
		// recv left xzplane
		if (rank != neighbors[1]) MPI_Recv(&local_array[1][0][1], 1, xzplane, neighbors[1], 0, comm, &status);

		// send front xyplane
		if (rank != neighbors[4]) MPI_Send(&local_array[1][1][1], 1, xyplane, neighbors[4], 0, comm);
		// recv back xyplane
		if (rank != neighbors[5]) MPI_Recv(&local_array[1][1][local_size[2]+1], 1, xyplane, neighbors[5], 0, comm, &status);

		// send back xyplane
		if (rank != neighbors[5]) MPI_Send(&local_array[1][1][local_size[2]], 1, xyplane, neighbors[5], 0, comm);
		// recv front xyplane
		if (rank != neighbors[4]) MPI_Recv(&local_array[1][1][0], 1, xyplane, neighbors[4], 0, comm, &status);

//		if (rank==0) {
//		for(row=0; row<mem_size[0]; row++) {
//			for(col=0; col<mem_size[1]; col++) {
//				for(pln=0; pln<mem_size[2]; pln++) {
//					printf("%c", local_array[row][col][pln]);
//				}
//				printf("\n");
//			}
//			printf("-----------------------\n");
//		}
//		}
		
		// run the game of life
		
		// gameOfLife(local_array, local_array_copy, local_size[0], local_size[1], l, rank);
		
		// swap the arrays
		
//		temp1 = local_array;
//		local_array = local_array_copy;
//		local_array_copy = temp1;
//
//		temp2 = local_array_pointer;
//		local_array_pointer = local_array_copy_pointer;
//		local_array_copy_pointer = temp2;		
	
		// check to see if this iteration needs a checkpoint
		
		kCounter--;
		if (kCounter == 0) {
			kCounter = k;
			
			// checkpoint code
			
			MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
			MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
			
			MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status);
			
			MPI_File_close(&fh); 	

			if (rank == 0)
				printf("Checkpoint made: Iteration %d\n", iterations+1);
			
		} // end if kCounter == 0 
	} // end iteration loop	
	iterations--; // undo the loop's final increment so the report below shows the last completed iteration
	
	// all done! repeat the checkpoint process
	
	MPI_File_open(MPI_COMM_WORLD, FINAL_RESULTS, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
	MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	
	MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status);
	
	MPI_File_close(&fh); 	

	if (rank == 0)
		printf("Final Results made: Iteration %d\n", iterations+1);
	
	time2 = MPI_Wtime();
	if (rank == 0)
	    printf("Elapsed Seconds: %f\n", time2-time1);
	fflush(stdout);
	
	MPI_Finalize(); 
	return EXIT_SUCCESS; 
}
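The ghost-plane exchange above pairs blocking MPI_Send/MPI_Recv calls in a fixed order, so it relies on message buffering (or a non-periodic neighbor chain) to make progress once the planes grow large. MPI_Sendrecv removes that dependency. Below is a minimal sketch of one axis of such an exchange, assuming the same padded char layout as the program above; the function and parameter names are illustrative, not taken from that program.

#include <mpi.h>

/* Sketch: exchange the two x-direction ghost planes of a padded
 * (nx+2) x (ny+2) x (nz+2) char block, plane index fastest in memory.
 * 'up' and 'down' are the neighbor ranks in the x direction. */
static void exchange_x_ghost_planes(char *block, int nx, int ny, int nz,
                                    int up, int down, MPI_Comm comm)
{
    MPI_Datatype yzplane;
    MPI_Status status;
    int plane = (ny + 2) * (nz + 2);      /* chars per padded x-plane */

    /* ny rows of nz interior chars, skipping the two z ghost cells per row */
    MPI_Type_vector(ny, nz, nz + 2, MPI_CHAR, &yzplane);
    MPI_Type_commit(&yzplane);

    /* first interior plane goes up; lower ghost plane arrives from down */
    MPI_Sendrecv(block + 1 * plane + (nz + 2) + 1, 1, yzplane, up, 0,
                 block + (nx + 1) * plane + (nz + 2) + 1, 1, yzplane, down, 0,
                 comm, &status);

    /* last interior plane goes down; upper ghost plane arrives from up */
    MPI_Sendrecv(block + nx * plane + (nz + 2) + 1, 1, yzplane, down, 1,
                 block + (nz + 2) + 1, 1, yzplane, up, 1,
                 comm, &status);

    MPI_Type_free(&yzplane);
}

Passing MPI_PROC_NULL for up or down at a non-periodic boundary turns the corresponding send/receive into a no-op, which removes the need for the rank != neighbors[i] guards used above.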
Example #25
0
/******************************************************************************
*
*   The main worker node function.
*
*   int thread_id: the thread_id
*   char *fastq1: FIFO from which bowtie2 can get read1
*   char *fastq2: FIFO from which bowtie2 can get read2 (if it exists)
*
*******************************************************************************/
void herd_worker_node(int thread_id, char *fastq1, char *fastq2) {
    int cmd_length = 1, max_qname = 0, status, strand;
    char *cmd, *last_qname = calloc(1, sizeof(char));
    MPI_Header *packed_header;
    MPI_read *packed_read = calloc(1, sizeof(MPI_read));
    bam_hdr_t *header;
    bam1_t *read1 = bam_init1();
    bam1_t *read2 = bam_init1();
    samFile *fp;
#ifdef DEBUG
    MPI_Status stat;
    int current_p_size = 100;
    htsFile *of;
    bam_hdr_t *debug_header = bam_hdr_init();
    bam1_t *debug_read = bam_init1();
    global_header = bam_hdr_init();
    void *p = calloc(100,1);
    char *oname = NULL;
#else
    int i = 0;
#endif
    time_t t0, t1;
    int swapped = 0;
    assert(last_qname);
    assert(packed_read);

    //Which strand should we be aligning to?
    if(config.directional) {
        strand = (thread_id-1) % 2;
    } else {
        strand = (thread_id-1) % 4;
    }

    packed_read->size = 0;
    packed_read->packed = NULL;

    //construct the bowtie2 command
    cmd_length += (int) strlen("bowtie2 -q --reorder") + 1;
    cmd_length += (int) strlen(config.bowtie2_options) + 1;
    cmd_length += (int) strlen("--norc -x") + 1;
    cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1;
    cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3;
    if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded.

#ifdef DEBUG
    oname = malloc(sizeof(char) * (11+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); //extra room in case thread_id has more than one digit
    assert(oname);
    sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id);
    if(!config.quiet) fprintf(stderr, "Writing output to %s\n", oname);
    of = sam_open(oname, "wb");
    free(oname);
#endif

    cmd = (char *) malloc(sizeof(char) * cmd_length);
    assert(cmd);
    if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand
        if(config.paired) {
            sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2);
        } else {
            sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1);
        }
    } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand
        if(config.paired) {
            sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2);
        } else {
            sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1);
        }
    } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand
        if(config.paired) {
            sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2);
        } else {
            sprintf(cmd, "bowtie2 -q --reorder %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1);
        }
    } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand
        if(config.paired) {
            sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2);
        } else {
            sprintf(cmd, "bowtie2 -q --reorder %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1);
        }
    } else {
        fprintf(stderr, "Oh shit, got strand %i!\n", strand);
        return;
    }

    //Start the process
    if(!config.quiet) fprintf(stderr, "Node %i executing: %s\n", thread_id, cmd); fflush(stderr);
    fp = sam_popen(cmd);
    header = sam_hdr_read(fp);
#ifdef DEBUG
    sam_hdr_write(of, header);
#endif

#ifndef DEBUG
    packed_header = pack_header(header);
    if(thread_id == 1) {
        //Send the header
        MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD);
        if(status != MPI_SUCCESS) {
            fprintf(stderr, "MPI_Send returned %i\n", status);
            fflush(stderr);
        }
    }
#else
    packed_header = pack_header(header);
    void *tmp_pointer = malloc(packed_header->size);
    assert(tmp_pointer);
    MPI_Request request;
    MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request);
    status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat);
    if(status != MPI_SUCCESS) fprintf(stderr, "We seem to have not been able to send the message to ourselves!\n");
    MPI_Wait(&request, &stat);
    unpack_header(debug_header, tmp_pointer);
    global_header = debug_header;
    free(tmp_pointer);
#endif

    t0 = time(NULL);
    if(!config.quiet) fprintf(stderr, "Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stderr);
    while(sam_read1(fp, header, read1) >= 0) {
#ifdef DEBUG
        sam_write1(of, global_header, read1);
#endif
        if(strcmp(bam_get_qname(read1), last_qname) == 0) { //Multimapper
            if(config.paired) {
                sam_read1(fp, header, read2);
#ifdef DEBUG
                sam_write1(of, global_header, read2);
#endif
            }
            continue;
        } else {
            if(read1->core.l_qname > max_qname) {
                max_qname = read1->core.l_qname + 10;
                last_qname = realloc(last_qname, sizeof(char) * max_qname);
                assert(last_qname);
            }
            strcpy(last_qname, bam_get_qname(read1));
        }

        //Are paired-end reads in the wrong order?
        swapped = 0;
        if(config.paired) {
            if(read1->core.flag & BAM_FREAD2) {
                swapped = 1;
                sam_read1(fp, header, read2);
                packed_read = pack_read(read2, packed_read);
#ifndef DEBUG
                MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
#else
                sam_write1(of, global_header, read2);
                if(packed_read->size > current_p_size) {
                    p = realloc(p, packed_read->size);
                    assert(p);
                }
                MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request);
                status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat);
                MPI_Wait(&request, &stat);
                debug_read = unpack_read(debug_read, p);
#endif
            }
        }

        //Send the read
        packed_read = pack_read(read1, packed_read);
#ifndef DEBUG
        MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
#else
        if(packed_read->size > current_p_size) {
            p = realloc(p, packed_read->size);
            assert(p);
        }
        MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request);
        status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat);
        MPI_Wait(&request, &stat);
#endif
        //Deal with paired-end reads
        if(config.paired && !swapped) {
            sam_read1(fp, header, read2);
            packed_read = pack_read(read2, packed_read);
#ifndef DEBUG
            MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
#else
            sam_write1(of, global_header, read2);
            if(packed_read->size > current_p_size) {
                p = realloc(p, packed_read->size);
                assert(p);
            }
            MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request);
            status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat);
            MPI_Wait(&request, &stat);
            debug_read = unpack_read(debug_read, p);
#endif
        }
#ifndef DEBUG
        i++;
#endif
    }
    t1 = time(NULL);
    if(!config.quiet) fprintf(stderr, "Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stderr);

    //Notify the master node
    packed_read->size = 0;
#ifndef DEBUG
    void *A = malloc(1);
    assert(A);
    MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
    free(A);
#endif

    //Close things up
    bam_hdr_destroy(header);
    bam_destroy1(read1);
    bam_destroy1(read2);
    free(cmd);
    if(packed_read->packed != NULL) free(packed_read->packed);
    free(packed_read);
    if(packed_header->packed != NULL) free(packed_header->packed);
    free(packed_header);
    free(last_qname);
    sam_pclose(fp);
    //Remove the FIFO(s)
    unlink(fastq1);
    if(config.paired) unlink(fastq2);
#ifdef DEBUG
    sam_close(of);
    bam_hdr_destroy(debug_header);
    bam_destroy1(debug_read);
    free(p);
#endif
    if(!config.quiet) fprintf(stderr, "Exiting worker node %i\n", thread_id); fflush(stderr);
};
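The DEBUG branches above bounce every packed buffer through MPI: a non-blocking MPI_Isend is posted, the same bytes are read back with a blocking MPI_Recv, and only then is the send completed with MPI_Wait. A self-contained sketch of that self-echo pattern, using MPI_COMM_SELF instead of the master/worker ranks of the program above (the buffer contents and tag are illustrative):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char out[] = "packed record";
    char in[sizeof(out)];
    MPI_Request request;
    MPI_Status stat;

    MPI_Init(&argc, &argv);

    /* Post the send first, then satisfy it ourselves; posting the Recv is
     * what guarantees the Isend can complete in MPI_Wait. */
    MPI_Isend(out, (int) sizeof(out), MPI_BYTE, 0, 5, MPI_COMM_SELF, &request);
    MPI_Recv(in, (int) sizeof(in), MPI_BYTE, 0, 5, MPI_COMM_SELF, &stat);
    MPI_Wait(&request, &stat);

    printf("echoed: %s\n", in);

    MPI_Finalize();
    return 0;
}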
Example #26
0
/*! \brief
 *
 * <pre>
 * Purpose
 * =======
 *
 * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
 * graph partitioning algorithm to the symmetrized graph A+A'.  The
 * multilevel graph partitioning algorithm used is the
 * ParMETIS_V3_NodeND routine available in the parallel graph
 * partitioning package parMETIS.  
 *
 * The number of independent sub-domains noDomains computed by this
 * algorithm has to be a power of 2.  Hence noDomains is the largest
 * power of 2 that is less than or equal to nprocs_i, where nprocs_i = nprow
 * * npcol is the number of processors used in SuperLU_DIST.
 *
 * Arguments
 * =========
 *
 * A       (input) SuperMatrix*
 *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
 *         of the linear equations is A->nrow.  Matrix A is distributed
 *         in NRformat_loc format.
 *
 * perm_r  (input) int_t*
 *         Row permutation vector of size A->nrow, which defines the 
 *         permutation matrix Pr; perm_r[i] = j means row i of A is in 
 *         position j in Pr*A.
 *
 * perm_c  (output) int_t*
 *	   Column permutation vector of size A->ncol, which defines the 
 *         permutation matrix Pc; perm_c[i] = j means column i of A is 
 *         in position j in A*Pc.
 *
 * nprocs_i (input) int*
 *         Number of processors the input matrix is distributed on in a block
 *         row format.  It corresponds to number of processors used in
 *         SuperLU_DIST.
 *
 * noDomains (input) int*, must be power of 2
 *         Number of independent domains to be computed by the graph
 *         partitioning algorithm.  ( noDomains <= nprocs_i )
 *
 * sizes   (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing the number of nodes
 *         for each sub-domain and each separator.  Separators are stored 
 *         from left to right.
 *         Memory for the array is allocated in this routine.
 *
 * fstVtxSep (output) int_t**, of size 2 * noDomains
 *         Returns pointer to an array containing first node for each
 *         sub-domain and each separator.
 *         Memory for the array is allocated in this routine.
 *
 * Return value
 * ============
 *   < 0, the absolute value is the number of bytes allocated during this routine.
 *   > 0, number of bytes allocated when the routine ran out of memory.
 * </pre>
 */
float
get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
		     int nprocs_i, int noDomains, 
		     int_t **sizes, int_t **fstVtxSep,
		     gridinfo_t *grid, MPI_Comm *metis_comm)

{
  NRformat_loc *Astore;
  int   iam, p;
  int   *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
  int   *options, numflag;
  int_t m_loc, nnz_loc, fst_row;
  int_t m, n, bnz, i, j;
  int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
  int_t *b_rowptr, *b_colind;
  int_t *dist_order;
  int  *recvcnts, *displs;
  /* first row index on each processor when the matrix is distributed
     on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
  int_t  *vtxdist_i, *vtxdist_o; 
  int_t szSep, k, noNodes;
  float apat_mem_l; /* memory used during the computation of the graph of A+A' */
  float mem;  /* Memory used during this routine */
  MPI_Status status;

  /* Initialization. */
  MPI_Comm_rank (grid->comm, &iam);
  n = A->ncol;
  m = A->nrow;
  if ( m != n ) ABORT("Matrix is not square");
  mem = 0.;

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
#endif

  Astore = (NRformat_loc *) A->Store;
  nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */
  m_loc = Astore->m_loc;     /* number of rows local to this processor */
  fst_row = Astore->fst_row; /* global index of the first row */
  rowptr = Astore->rowptr;   /* pointer to rows and column indices */
  colind = Astore->colind;
  
#if ( PRNTlevel>=1 )
  if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
		     noDomains);
#endif

  numflag = 0;
  /* determine first row on each processor */
  vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
  vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
  if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");

  MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
		 grid->comm);
  vtxdist_i[nprocs_i] = m;

  if (noDomains == nprocs_i) {
    /* keep the same distribution of A */
    for (p = 0; p <= nprocs_i; p++)
      vtxdist_o[p] = vtxdist_i[p];
  }
  else {
    i = n / noDomains;
    j = n % noDomains;
    for (k = 0, p = 0; p < noDomains; p++) {
      vtxdist_o[p] = k;
      k += i;
      if (p < j)  k++;
    }
    /* The remaining non-participating processors get the same 
       first-row-number as the last processor.   */
    for (p = noDomains; p <= nprocs_i; p++)
      vtxdist_o[p] = k;
  }

#if ( DEBUGlevel>=2 )
  if (!iam)
    PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
#endif  

  /* Compute distributed A + A' */
  if ((apat_mem_l = 
       a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
			     n, rowptr, colind, noDomains, vtxdist_o,
			     &bnz, &b_rowptr, &b_colind, grid)) > 0)
    return (apat_mem_l);
  mem += -apat_mem_l;
  
  /* Initialize and allocate storage for parMetis. */    
  (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
  l_sizes = *sizes;
  (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
  if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
  l_fstVtxSep = *fstVtxSep;
  m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];
  
  if ( iam < noDomains) 
    /* dist_order_int is the perm returned by parMetis, distributed */
    if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int))))
      ABORT("SUPERLU_MALLOC fails for dist_order_int.");

  /* ParMETIS represents the column pointers and row indices of *
   * the input matrix using integers. When SuperLU_DIST uses    *
   * long int for the int_t type, then several supplementary    *
   * copies need to be performed in order to call ParMETIS.     */
#if defined (_LONGINT)
  l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
  if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");
  
  /* Allocate storage */
  if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
  for (i = 0; i <= m_loc; i++)
    b_rowptr_int[i] = b_rowptr[i];
  SUPERLU_FREE (b_rowptr);
  
  if ( bnz ) {
    if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
      ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
    for (i = 0; i < bnz; i++)
      b_colind_int[i] = b_colind[i];
    SUPERLU_FREE (b_colind);
  }
  
  if ( !(vtxdist_o_int = 
	 (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
    ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
  for (i = 0; i <= nprocs_i; i++)
    vtxdist_o_int[i] = vtxdist_o[i];
  SUPERLU_FREE (vtxdist_o);

#else  /* Default */

  vtxdist_o_int = vtxdist_o;
  b_rowptr_int = b_rowptr; b_colind_int = b_colind;
  l_sizes_int = l_sizes;

#endif
    
  if ( iam < noDomains) {
    options = (int *) SUPERLU_MALLOC(4 * sizeof(int));
    options[0] = 0;
    options[1] = 0;
    options[2] = 0;
    options[3] = 1;

    ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int, 
		       &numflag, options,
		       dist_order_int, l_sizes_int, metis_comm);
  }
  
  if (bnz) 
    SUPERLU_FREE (b_colind_int);
  if ( iam < noDomains) {
    SUPERLU_FREE (options);
  }
  SUPERLU_FREE (b_rowptr_int);
  
#if defined (_LONGINT)
  /* Copy data from dist_order_int to dist_order */
  if ( iam < noDomains) {
    /* dist_order is the perm returned by parMetis, distributed */
    if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
      ABORT("SUPERLU_MALLOC fails for dist_order.");
    for (i = 0; i < m_loc; i++)
      dist_order[i] = dist_order_int[i];
    SUPERLU_FREE(dist_order_int);
    
    for (i = 0; i < 2*noDomains; i++)
      l_sizes[i] = l_sizes_int[i];
    SUPERLU_FREE(l_sizes_int);
  }
#else 
  dist_order = dist_order_int;
#endif
  
  /* Allgatherv dist_order to get perm_c */
  if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for displs.");
  if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
    ABORT ("SUPERLU_MALLOC fails for recvcnts.");
  for (i = 0; i < nprocs_i; i++)
    recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i];
  displs[0]=0;
  for(i=1; i < nprocs_i; i++) 
    displs[i] = displs[i-1] + recvcnts[i-1];
  
  MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, 
		  mpi_int_t, grid->comm);

  if ( iam < noDomains) {
    SUPERLU_FREE (dist_order);
  }
  SUPERLU_FREE (vtxdist_i);
  SUPERLU_FREE (vtxdist_o_int);
  SUPERLU_FREE (recvcnts);
  SUPERLU_FREE (displs);
  
  /* send l_sizes to every processor p >= noDomains */
  if (!iam)
    for (p = noDomains; p < nprocs_i; p++)
      MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
  if (noDomains <= iam && iam < nprocs_i)
    MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
	      &status);
  
  /* Determine the first node in each separator, store it in l_fstVtxSep */  
  for (j = 0; j < 2 * noDomains; j++)
    l_fstVtxSep[j] = 0;
  l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
  szSep = noDomains;
  i = 0;
  while (szSep != 1) {
    for (j = i; j < i + szSep; j++) {
      l_fstVtxSep[j] += l_sizes[j]; 	      
    }
    for (j = i; j < i + szSep; j++) {
      k = i + szSep + (j-i) / 2;
      l_fstVtxSep[k] += l_fstVtxSep[j]; 
    }
    i += szSep;
    szSep = szSep / 2;
  }
  
  l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
  i = 2 * noDomains - 2;
  szSep = 1;
  while (i > 0) {
    for (j = i; j < i + szSep; j++) {
      k = (i - 2 * szSep) + (j-i) * 2 + 1;
      noNodes = l_fstVtxSep[k];
      l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
      l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - 
	noNodes - l_sizes[k-1];
    }
    szSep *= 2;
    i -= szSep;
  }

#if ( PRNTlevel>=2 )
  if (!iam ) {
    PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
    PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
  }
#endif

#if ( DEBUGlevel>=1 )
  CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
#endif
  
  return (-mem);

} /* get_perm_c_parmetis */
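The vtxdist_o computation in the else branch above is the usual balanced block distribution: each of the noDomains parts receives n / noDomains vertices, and the first n % noDomains parts receive one extra. A standalone sketch of just that rule, without the SuperLU_DIST types (the function name and the small driver are illustrative):

#include <stdio.h>

/* Fill vtxdist[0..nparts] with the first global vertex index of each part,
 * distributing n vertices as evenly as possible; vtxdist[nparts] == n. */
static void block_vtxdist(long n, int nparts, long *vtxdist)
{
    long chunk = n / nparts, extra = n % nparts, k = 0;
    int p;

    for (p = 0; p < nparts; p++) {
        vtxdist[p] = k;
        k += chunk + (p < extra ? 1 : 0);
    }
    vtxdist[nparts] = k;   /* == n */
}

int main(void)
{
    long vtxdist[5];
    int p;

    block_vtxdist(10, 4, vtxdist);   /* parts of size 3, 3, 2, 2 */
    for (p = 0; p <= 4; p++)
        printf("vtxdist[%d] = %ld\n", p, vtxdist[p]);
    return 0;
}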
static int f(realtype t, N_Vector u, N_Vector udot, void *user_data)
{
  realtype uLeft, uRight, ui, ult, urt;
  realtype hordc, horac, hdiff, hadv;
  realtype *udata, *dudata;
  long int i, my_length;
  int npes, my_pe, my_pe_m1, my_pe_p1, last_pe, my_last;
  UserData data;
  MPI_Status status;
  MPI_Comm comm;

  /* Extract MPI info. from data */
  data = (UserData) user_data;
  comm = data->comm;
  npes = data->npes;
  my_pe = data->my_pe;
  
  /* If this process is inactive, return now */
  if (my_pe == npes) return(0);

  /* Extract problem constants from data */
  hordc = data->hdcoef;
  horac = data->hacoef;

  /* Find related processes */
  my_pe_m1 = my_pe - 1;
  my_pe_p1 = my_pe + 1;
  last_pe = npes - 1;

  /* Obtain local arrays */
  udata = NV_DATA_P(u);
  dudata = NV_DATA_P(udot);
  my_length = NV_LOCLENGTH_P(u);
  my_last = my_length - 1;

  /* Pass needed data to processes before and after current process. */
   if (my_pe != 0)
     MPI_Send(&udata[0], 1, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm);
   if (my_pe != last_pe)
     MPI_Send(&udata[my_length-1], 1, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm);   

  /* Receive needed data from processes before and after current process. */
   if (my_pe != 0)
     MPI_Recv(&uLeft, 1, PVEC_REAL_MPI_TYPE, my_pe_m1, 0, comm, &status);
   else uLeft = ZERO;
   if (my_pe != last_pe)
     MPI_Recv(&uRight, 1, PVEC_REAL_MPI_TYPE, my_pe_p1, 0, comm,
              &status);   
   else uRight = ZERO;

  /* Loop over all grid points in current process. */
  for (i=0; i<my_length; i++) {

    /* Extract u at x_i and two neighboring points */
    ui = udata[i];
    ult = (i==0) ? uLeft: udata[i-1];
    urt = (i==my_length-1) ? uRight : udata[i+1];

    /* Set diffusion and advection terms and load into udot */
    hdiff = hordc*(ult - TWO*ui + urt);
    hadv = horac*(urt - ult);
    dudata[i] = hdiff + hadv;
  }

  return(0);
}
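The loop above is a standard central-difference semi-discretization of a 1-D advection-diffusion right-hand side. Writing u_i for udata[i], each component of udot is

\[
\dot u_i = \mathrm{hordc}\,(u_{i-1} - 2u_i + u_{i+1}) + \mathrm{horac}\,(u_{i+1} - u_{i-1}),
\]

which approximates d*u_xx + c*u_x at x_i provided hordc and horac carry the usual d/(dx^2) and c/(2*dx) scalings (an assumption here; those coefficients are set elsewhere in the example). The uLeft/uRight values exchanged above supply the i-1 and i+1 neighbors at the subdomain boundaries, with ZERO used at the physical ends.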
Example #28
0
int main(int argc, char *argv[])
{
  MPI_Status status;
  int num, rank, size, tag, next, from;

  if (argc != 2) {
    printf("appel : nom du programme nbre de tours \n");
    exit(-1);
  }

  /* Start up MPI */

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Arbitrarily choose 201 to be our tag.  Calculate the */
  /* rank of the next process in the ring.  Use the modulus */
  /* operator so that the last process "wraps around" to rank */
  /* zero. */

  tag = 201;
  next = (rank + 1) % size;
  from = (rank + size - 1) % size;

  /* If we are the "console" process, get a integer from the */
  /* user to specify how many times we want to go around the */
  /* ring */

  if (rank == 0) {
    num = atoi (argv[1]);

    printf("Process %d sending %d to %d\n", rank, num, next);
    MPI_Send(&num, 1, MPI_INT, next, tag, MPI_COMM_WORLD); 
  }

  /* Pass the message around the ring.  The exit mechanism works */
  /* as follows: the message (a positive integer) is passed */
  /* around the ring.  Each time it passes rank 0, it is decremented. */
  /* When each process receives a message containing a 0 value, it */
  /* passes it on to the next process and then quits.  By passing */
  /* the 0 first, every process gets the 0 message and can exit */
  /* normally. */

  while (1) {
    MPI_Recv(&num, 1, MPI_INT, from, tag, MPI_COMM_WORLD, &status);

    printf("Process %d received %d\n", rank, num);

    if (rank == 0) {
      num--;
      printf("Process 0 decremented num\n");
    }

    printf("Process %d sending %d to %d\n", rank, num, next);
    MPI_Send(&num, 1, MPI_INT, next, tag, MPI_COMM_WORLD);

    if (num == 0) {
      printf("Process %d exiting\n", rank);
      break;
    }
  }

  /* The last process does one extra send to process 0, which needs */
  /* to be received before the program can exit */

  if (rank == 0)
    MPI_Recv(&num, 1, MPI_INT, from, tag, MPI_COMM_WORLD, &status);

  /* Quit */

  MPI_Finalize();
  return 0;
}
static void PrintOutput(realtype g_val, N_Vector uB, UserData data)
{
  MPI_Comm comm;
  MPI_Status status;
  int npes, my_pe;
  long int i, Ni, indx, local_N, nperpe, nrem;
  realtype *uBdata;
  realtype *mu;

  comm = data->comm;
  npes = data->npes;
  my_pe = data->my_pe;
  local_N = data->local_N;
  nperpe = data->nperpe;
  nrem = data->nrem;

  uBdata = NV_DATA_P(uB);

  if (my_pe == npes) {

#if defined(SUNDIALS_EXTENDED_PRECISION)
    printf("\ng(tf) = %8Le\n\n", g_val);
    printf("dgdp(tf)\n  [ 1]: %8Le\n  [ 2]: %8Le\n\n", -uBdata[0], -uBdata[1]);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
    printf("\ng(tf) = %8le\n\n", g_val);
    printf("dgdp(tf)\n  [ 1]: %8le\n  [ 2]: %8le\n\n", -uBdata[0], -uBdata[1]);
#else
    printf("\ng(tf) = %8e\n\n", g_val);
    printf("dgdp(tf)\n  [ 1]: %8e\n  [ 2]: %8e\n\n", -uBdata[0], -uBdata[1]);
#endif

    mu = (realtype *)malloc(NEQ*sizeof(realtype));
    if (check_flag((void *)mu, "malloc", 2, my_pe)) MPI_Abort(comm, 1);

    indx = 0;
    for ( i = 0; i < npes; i++) {
      Ni = ( i < nrem ) ? nperpe+1 : nperpe;
      MPI_Recv(&mu[indx], Ni, PVEC_REAL_MPI_TYPE, i, 0, comm, &status);
      indx += Ni;
    }

    printf("mu(t0)\n");

#if defined(SUNDIALS_EXTENDED_PRECISION)
    for (i=0; i<NEQ; i++)
      printf("  [%2ld]: %8Le\n", i+1, mu[i]);
#elif defined(SUNDIALS_DOUBLE_PRECISION)
    for (i=0; i<NEQ; i++)
      printf("  [%2ld]: %8le\n", i+1, mu[i]);
#else
    for (i=0; i<NEQ; i++)
      printf("  [%2ld]: %8e\n", i+1, mu[i]);
#endif

    free(mu);

  } else {

    MPI_Send(uBdata, local_N, PVEC_REAL_MPI_TYPE, npes, 0, comm);

  }

}
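PrintOutput above assembles mu with one point-to-point MPI_Recv per contributing process. The same variable-block layout (nperpe entries per rank, plus one extra on the first nrem ranks) can also be gathered with a single collective call. Below is a minimal MPI_Gatherv sketch, independent of the SUNDIALS types, in which rank 0 collects and every rank contributes a block (unlike the extra, non-contributing root process used above):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i, neq = 10;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Same partitioning rule as PrintOutput: nperpe per rank, plus one
     * extra element on the first nrem ranks. */
    int nperpe = neq / size, nrem = neq % size;
    int local_N = (rank < nrem) ? nperpe + 1 : nperpe;

    int *recvcnts = NULL, *displs = NULL;
    double *mu = NULL;
    if (rank == 0) {
        recvcnts = malloc(size * sizeof(int));
        displs   = malloc(size * sizeof(int));
        mu       = malloc(neq * sizeof(double));
        for (i = 0; i < size; i++) {
            recvcnts[i] = (i < nrem) ? nperpe + 1 : nperpe;
            displs[i]   = (i == 0) ? 0 : displs[i-1] + recvcnts[i-1];
        }
    }

    double *local = malloc((local_N > 0 ? local_N : 1) * sizeof(double));
    for (i = 0; i < local_N; i++) local[i] = rank + 0.1 * i;   /* dummy data */

    MPI_Gatherv(local, local_N, MPI_DOUBLE,
                mu, recvcnts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < neq; i++) printf("mu[%d] = %g\n", i, mu[i]);
        free(recvcnts); free(displs); free(mu);
    }
    free(local);
    MPI_Finalize();
    return 0;
}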
 inline void MyMPI_Send (const string & s, int dest)
 {
   MPI_Send( const_cast<char*> (s.c_str()), s.length(), MPI_CHAR, dest, 1, MPI_COMM_WORLD);
 }