Example #1
/* Extract an m x n submatrix within an m x N matrix and transpose it.
   Assume storage by rows; the defined datatype accesses by columns */
MPI_Datatype transpose_type(int N, int m, int n, MPI_Datatype type)
/* computes a datatype for the transpose of an mxn matrix 
   with entries of type type */
{
  MPI_Datatype subrow, subrow1, submatrix;
  MPI_Aint lb, extent;
  
  MPI_Type_vector(m, 1, N, type, &subrow);
  MPI_Type_get_extent(type, &lb, &extent);
  MPI_Type_create_resized(subrow, 0, extent, &subrow1);
  MPI_Type_contiguous(n, subrow1, &submatrix); 
  MPI_Type_commit(&submatrix);
  MPI_Type_free( &subrow );
  MPI_Type_free( &subrow1 );

  /* Add a consistency test: the size of submatrix should be
     n * m * sizeof(type) and the extent should be ((m-1)*N+n) * sizeof(type) */
  {
      int      tsize;
      MPI_Aint textent, llb;
      MPI_Type_size( type, &tsize );
      MPI_Type_get_true_extent( submatrix, &llb, &textent );
      
      if (textent != tsize * (N * (m-1)+n)) {
	  fprintf( stderr, "Transpose Submatrix extent is %ld, expected %ld (%d,%d,%d)\n",
		   (long)textent, (long)(tsize * (N * (m-1)+n)), N, n, m );
      }
  }

  return(submatrix);
}
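A minimal usage sketch (an assumption, not part of the original example): the committed type walks the stored-by-rows submatrix column by column on the send side, while the receiver simply unpacks n*m contiguous elements, i.e. the transpose stored by rows. The helper names below are hypothetical.

#include <mpi.h>

MPI_Datatype transpose_type(int N, int m, int n, MPI_Datatype type); /* from Example #1 */

void send_transposed(double *matrix, int N, int m, int n, int dest, MPI_Comm comm)
{
  MPI_Datatype t = transpose_type(N, m, n, MPI_DOUBLE);
  MPI_Send(matrix, 1, t, dest, 0, comm);   /* reads the m x n block column by column */
  MPI_Type_free(&t);
}

void recv_transposed(double *out, int m, int n, int src, MPI_Comm comm)
{
  /* the receiver sees n*m contiguous doubles: the transpose, stored by rows */
  MPI_Recv(out, n * m, MPI_DOUBLE, src, 0, comm, MPI_STATUS_IGNORE);
}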
Example #2
File: grid.c Project: Thundzz/TDP
double* partition_matrix(double *a,
	int N, int gd, 
	MPI_Datatype *type_block)
{
	MPI_Datatype type_block_tmp;

	int NB = N/gd;

	double* b = malloc(NB*NB*sizeof(double));

	MPI_Type_vector(NB, NB, N, MPI_DOUBLE, &type_block_tmp);
	MPI_Type_create_resized(type_block_tmp, 0, sizeof(double), type_block);
	MPI_Type_commit(type_block);

	int counts[gd*gd];
	int disps[gd*gd];
	for (int i=0; i<gd; i++) {
		for (int j=0; j<gd; j++) {
			disps[i*gd+j] = i*N*NB+j*NB;
			counts [i*gd+j] = 1;
		}
	}
	MPI_Scatterv(a, counts, disps, *type_block, b, NB*NB, MPI_DOUBLE, 0, MPI_COMM_WORLD);	

	return b;
}
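A hedged counterpart sketch (not in the original project): gathering the NB x NB blocks back into the full N x N matrix reuses the same resized block type, with the same one-block-per-rank counts and displacements measured in doubles.

void gather_matrix(double *a, double *b, int N, int gd, MPI_Datatype type_block)
{
	int NB = N/gd;

	int counts[gd*gd];
	int disps[gd*gd];
	for (int i=0; i<gd; i++) {
		for (int j=0; j<gd; j++) {
			disps[i*gd+j] = i*N*NB+j*NB;  /* block origin, in units of one double */
			counts[i*gd+j] = 1;           /* one resized block per rank */
		}
	}
	MPI_Gatherv(b, NB*NB, MPI_DOUBLE,
	            a, counts, disps, type_block, 0, MPI_COMM_WORLD);
}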
Example #3
static MPI_Datatype
create_indexed_gap_optimized_ddt( void )
{
    MPI_Datatype dt1, dt2, dt3;
    int bLength[3];
    MPI_Datatype types[3];
    MPI_Aint displ[3];
   
    MPI_Type_contiguous( 40, MPI_BYTE, &dt1 );
    MPI_Type_create_resized( dt1, 0, 44, &dt2 );
   
    bLength[0] = 4;
    bLength[1] = 9;
    bLength[2] = 36;
   
    types[0] = MPI_BYTE;
    types[1] = dt2;
    types[2] = MPI_BYTE;

    displ[0] = 0;
    displ[1] = 8;
    displ[2] = 44 * 9 + 8;
   
    MPI_Type_create_struct( 3, bLength, displ, types, &dt3 );
   
    MPI_Type_free( &dt1 );
    MPI_Type_free( &dt2 );
    MPI_DDT_DUMP( dt3 );
    MPI_Type_commit( &dt3 );
    return dt3;
}
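MPI_DDT_DUMP is a project-local debugging macro; a no-op stand-in such as the following (an assumption, not the project's definition) is enough to compile the example elsewhere.

#ifndef MPI_DDT_DUMP
#define MPI_DDT_DUMP(ddt) do { /* debugging hook; intentionally empty here */ } while (0)
#endif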
Example #4
static PetscErrorCode MakeDatatype(MPI_Datatype *dtype)
{
  PetscErrorCode ierr;
  MPI_Datatype dtypes[3],tmptype;
  PetscMPIInt  lengths[3];
  MPI_Aint     displs[3];
  Unit         dummy;

  PetscFunctionBegin;
  dtypes[0] = MPIU_INT;
  dtypes[1] = MPIU_SCALAR;
  dtypes[2] = MPI_CHAR;
  lengths[0] = 1;
  lengths[1] = 1;
  lengths[2] = 3;
  /* Curse the evil beings that made std::complex a non-POD type. */
  displs[0] = (char*)&dummy.rank - (char*)&dummy;  /* offsetof(Unit,rank); */
  displs[1] = (char*)&dummy.value - (char*)&dummy; /* offsetof(Unit,value); */
  displs[2] = (char*)&dummy.ok - (char*)&dummy;    /* offsetof(Unit,ok); */
  ierr = MPI_Type_create_struct(3,lengths,displs,dtypes,&tmptype);CHKERRQ(ierr);
  ierr = MPI_Type_commit(&tmptype);CHKERRQ(ierr);
  ierr = MPI_Type_create_resized(tmptype,0,sizeof(Unit),dtype);CHKERRQ(ierr);
  ierr = MPI_Type_commit(dtype);CHKERRQ(ierr);
  ierr = MPI_Type_free(&tmptype);CHKERRQ(ierr);
  {
    MPI_Aint lb,extent;
    ierr = MPI_Type_get_extent(*dtype,&lb,&extent);CHKERRQ(ierr);
    if (extent != sizeof(Unit)) SETERRQ2(PETSC_COMM_WORLD,PETSC_ERR_LIB,"New type has extent %d != sizeof(Unit) %d",extent,(int)sizeof(Unit));
  }
  PetscFunctionReturn(0);
}
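For reference, a Unit layout consistent with the lengths, types, and displacements used above (an assumption; the real definition lives elsewhere in the PETSc example source):

typedef struct {
  PetscInt    rank;   /* dtypes[0] = MPIU_INT,    lengths[0] = 1 */
  PetscScalar value;  /* dtypes[1] = MPIU_SCALAR, lengths[1] = 1 */
  char        ok[3];  /* dtypes[2] = MPI_CHAR,    lengths[2] = 3 */
} Unit;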
Example #5
void distribute_matrix(ATYPE *root_matrix, ATYPE *local_matrix, int local_rank, int proc_size, long partition, uint N){
  int sendcounts[proc_size], displs[proc_size];
  ATYPE *sendbuffer=NULL;

  MPI_Datatype MPI_type, MPI_type2;


  int rest = N - (partition * ( proc_size - 1) );


  MPI_Type_vector(N, 1, N, ATYPE_MPI, &MPI_type2);
  MPI_Type_create_resized( MPI_type2, 0, sizeof(ATYPE), &MPI_type);
  MPI_Type_commit(&MPI_type);


  for ( int i=0 ; i<proc_size ; ++i ){
    if ( i == proc_size - 1 ) {
      sendcounts[i] = rest;
    }
    else {
      sendcounts[i] = partition;
    }
    displs[i] = i*partition;
  }

  if ( local_rank == root )
    sendbuffer = &(root_matrix[0]);

  MPI_Scatterv( sendbuffer, sendcounts, displs, MPI_type, &(local_matrix[0]), partition*N, ATYPE_MPI, root, MPI_COMM_WORLD );
  MPI_Type_free(&MPI_type);
}
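The example relies on a few identifiers defined elsewhere in its project; one plausible (assumed) set of supporting definitions is:

#include <mpi.h>

typedef double ATYPE;          /* element type of the matrix */
#define ATYPE_MPI MPI_DOUBLE   /* matching MPI datatype       */
static const int root = 0;     /* rank that owns root_matrix  */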
Example #6
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /* C++ std::complex is not my favorite datatype.  Since it is not POD, we cannot use offsetof to find the offset of
     * vals.  But the layout is actually guaranteed by the standard, so we do a little dance here with struct
     * DummyBlock, substituting PetscReal for PetscComplex so that we can determine the offset.
     */
    struct DummyBlock {PetscInt row,col; PetscReal vals;};

    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);CHKERRQ(ierr);
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    ierr = MPI_Type_create_struct(2,blocklens,displs,types,&stype);CHKERRQ(ierr);
    ierr = MPI_Type_commit(&stype);CHKERRQ(ierr);
    ierr = MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);CHKERRQ(ierr); /* MPI-2 */
    ierr = MPI_Type_commit(&stash->blocktype);CHKERRQ(ierr);
    ierr = MPI_Type_free(&stype);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
Example #7
/* derived_resized_test()
 *
 * Tests behavior with resizing of a simple derived type.
 *
 * Returns the number of errors encountered.
 */
int derived_resized_test(void)
{
    int err, errs = 0;

    int count = 2;
    MPI_Datatype newtype, resizedtype;

    int size;
    MPI_Aint lb, extent;

    err = MPI_Type_contiguous(count, MPI_INT, &newtype);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error creating type in derived_resized_test()\n");
        }
        errs++;
    }

    err = MPI_Type_create_resized(newtype,
                                  (MPI_Aint) 0, (MPI_Aint) (2 * sizeof(int) + 10), &resizedtype);

    err = MPI_Type_size(resizedtype, &size);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error obtaining type size in derived_resized_test()\n");
        }
        errs++;
    }

    if (size != 2 * sizeof(int)) {
        if (verbose) {
            fprintf(stderr,
                    "error: size != %d in derived_resized_test()\n", (int) (2 * sizeof(int)));
        }
        errs++;
    }

    err = MPI_Type_get_extent(resizedtype, &lb, &extent);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error obtaining type extent in derived_resized_test()\n");
        }
        errs++;
    }

    if (extent != 2 * sizeof(int) + 10) {
        if (verbose) {
            fprintf(stderr,
                    "error: invalid extent (%d) in derived_resized_test(); should be %d\n",
                    (int) extent, (int) (2 * sizeof(int) + 10));
        }
        errs++;
    }

    MPI_Type_free(&newtype);
    MPI_Type_free(&resizedtype);

    return errs;
}
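A hypothetical stand-alone driver for the test (the real MPICH harness differs); it shows where the `verbose` flag referenced above would come from. In practice the flag would be defined near the top of the test file, before the function that uses it.

#include <stdio.h>
#include <mpi.h>

static int verbose = 1;          /* referenced by derived_resized_test() */

int derived_resized_test(void);

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int errs = derived_resized_test();
    if (errs)
        fprintf(stderr, "derived_resized_test: %d error(s)\n", errs);
    MPI_Finalize();
    return errs != 0;
}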
Example #8
JNIEXPORT jlong JNICALL Java_mpi_Datatype_getResized(
        JNIEnv *env, jclass clazz, jlong oldType, jint lb, jint extent)
{
    MPI_Datatype type;
    int rc = MPI_Type_create_resized((MPI_Datatype)oldType, lb, extent, &type);
    ompi_java_exceptionCheck(env, rc);
    return (jlong)type;
}
Example #9
File: main.c Project: sondrele/NTNU
// Function to create and commit MPI datatypes.
// Each datatype is resized to the size of a float; this appears to fix
// segmentation fault issues.
void create_types() {
    MPI_Datatype border_row_t0;
    MPI_Type_contiguous(local_width,            // count
                        MPI_FLOAT,              // old_type
                        &border_row_t0);        // newtype_p
    MPI_Type_create_resized(border_row_t0, 0, sizeof(float), &border_row_t);
    MPI_Type_commit(&border_row_t);

    MPI_Datatype border_col_t0;
    MPI_Type_vector(local_height,               // count
                    1,                          // blocklength
                    local_width + 2,            // stride
                    MPI_FLOAT,                  // old_type
                    &border_col_t0);            // newtype_p
    MPI_Type_create_resized(border_col_t0, 0, sizeof(float), &border_col_t);
    MPI_Type_commit(&border_col_t);

    MPI_Datatype pres_and_diverg_t0;
    MPI_Type_vector(local_height,               // count
                    local_width,                // blocklength
                    imageSize + 2,              // stride
                    MPI_FLOAT,                  // old_type
                    &pres_and_diverg_t0);       // newtype_p
    MPI_Type_create_resized(pres_and_diverg_t0, 0, sizeof(float), &pres_and_diverg_t);
    MPI_Type_commit(&pres_and_diverg_t);

    MPI_Datatype local_diverg_t0;
    MPI_Type_vector(local_height,               // count
                    local_width,                // blocklength
                    local_width,                // stride
                    MPI_FLOAT,                  // old_type
                    &local_diverg_t0);          // newtype_p
    MPI_Type_create_resized(local_diverg_t0, 0, sizeof(float), &local_diverg_t);
    MPI_Type_commit(&local_diverg_t);

    MPI_Datatype local_pres_t0;
    MPI_Type_vector(local_height,               // count
                    local_width,                // blocklength
                    local_width + 2,            // stride
                    MPI_FLOAT,                  // old_type
                    &local_pres_t0);            // newtype_p
    MPI_Type_create_resized(local_pres_t0, 0, sizeof(float), &local_pres_t);
    MPI_Type_commit(&local_pres_t);
}
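A hedged usage sketch of the row type (the neighbour ranks and the field name local_field are assumptions): because border_row_t was resized to one float, a single instance still describes local_width contiguous floats, so it can be sent straight out of the padded (local_width + 2)-wide array.

void exchange_border_rows(float *local_field, int rank_above, int rank_below) {
    int stride = local_width + 2;                 // padded row length, as in create_types()
    // send the first interior row up, receive the bottom ghost row from below
    MPI_Sendrecv(&local_field[1 * stride + 1],                  1, border_row_t, rank_above, 0,
                 &local_field[(local_height + 1) * stride + 1], 1, border_row_t, rank_below, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}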
Example #10
/*---------------------------------------------------------------------
 * Function:         Build_cyclic_mpi_type
 * Purpose:          Build an MPI derived datatype that can be used with
 *                   cyclically distributed data.
 * In arg:
 *    loc_n:         The number of elements assigned to each process
 * Global out:
 *    cyclic_mpi_t:  An MPI datatype that can be used with cyclically
 *                   distributed data
 */
void Build_cyclic_mpi_type(int loc_n) {
   MPI_Datatype temp_mpi_t;
   MPI_Aint lb, extent;

   MPI_Type_vector(loc_n, 1, comm_sz, MPI_INT, &temp_mpi_t);
   MPI_Type_get_extent(MPI_INT, &lb, &extent);
   MPI_Type_create_resized(temp_mpi_t, lb, extent, &cyclic_mpi_t);
   MPI_Type_commit(&cyclic_mpi_t);

}  /* Build_cyclic_mpi_type */
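A hedged usage sketch (the names global_vec and loc_vec are assumptions): because the resized extent is one int, displacement q selects the cyclic slice global_vec[q], global_vec[q + comm_sz], ... for process q.

void Scatter_cyclic(int global_vec[], int loc_vec[], int loc_n) {
   int sendcounts[comm_sz], displs[comm_sz];
   int q;

   for (q = 0; q < comm_sz; q++) {
      sendcounts[q] = 1;   /* one cyclic_mpi_t per process      */
      displs[q]     = q;   /* measured in extents, i.e. in ints */
   }
   MPI_Scatterv(global_vec, sendcounts, displs, cyclic_mpi_t,
         loc_vec, loc_n, MPI_INT, 0, MPI_COMM_WORLD);
}  /* Scatter_cyclic */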
Example #11
int main(int argc, char **argv) {

    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    char i;

    char a[ROWS*COLS];
    int NPROWS=2;  
    int NPCOLS=3;  
    int BLOCKROWS = ROWS/NPROWS; 
    int BLOCKCOLS = COLS/NPCOLS; 

    if (rank == 0) {
        for (int ii=0; ii<ROWS*COLS; ii++) {
            a[ii] = ii;
        }
    }

    if (p != NPROWS*NPCOLS) {
        fprintf(stderr,"Error: number of PEs %d != %d x %d\n", p, NPROWS, NPCOLS);
        MPI_Finalize();
        exit(-1);
    }
    char b[BLOCKROWS*BLOCKCOLS];
    for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++) b[ii] = 0;

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;

    MPI_Type_vector(BLOCKROWS, BLOCKCOLS, COLS, MPI_CHAR, &blocktype2);
    MPI_Type_create_resized( blocktype2, 0, sizeof(char), &blocktype);
    MPI_Type_commit(&blocktype);

    int disps[NPROWS*NPCOLS];
    int counts[NPROWS*NPCOLS];
    for (int ii=0; ii<NPROWS; ii++) {
        for (int jj=0; jj<NPCOLS; jj++) {
            disps[ii*NPCOLS+jj] = ii*COLS*BLOCKROWS+jj*BLOCKCOLS;
            counts [ii*NPCOLS+jj] = 1;
        }
    }
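    /* Hedged sketch, not part of the original snippet: the communication step
       this setup is normally followed by.  Each rank receives one resized
       BLOCKROWS x BLOCKCOLS block of the char matrix into b. */
    MPI_Scatterv(a, counts, disps, blocktype,
                 b, BLOCKROWS*BLOCKCOLS, MPI_CHAR,
                 0, MPI_COMM_WORLD);
    MPI_Type_free(&blocktype);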

  

    MPI_Finalize();

    return 0;
}
Example #12
void distribute_matrix(double **matrix, double *global_mat_ptr, int *sendCounts, int *displs, int * global_size, int *local_size) {
    int start[2]= {0,0};
    double *local_ptr =&(matrix[0][0]);
    MPI_Datatype subType;
    MPI_Datatype type;

    MPI_Type_create_subarray(2, global_size, local_size, start, MPI_ORDER_C, MPI_DOUBLE, &subType);
    MPI_Type_create_resized(subType, 0, local_size[1]*sizeof(double), &type);
    MPI_Type_commit(&type);

    MPI_Scatterv(global_mat_ptr, sendCounts, displs, type, local_ptr, local_size[0]*local_size[1],
                 MPI_DOUBLE, 0, MPI_COMM_WORLD);

    MPI_Type_free(&type);
}
Example #13
void gather_submatrices(double **matrix, double *global_mat_ptr, int *sendCounts, int *displs, int global_a_row, int *local_size, int my_rank) {
    int start[2]= {0,0}, global_size[2]= {global_a_row,global_a_row};
    double *local_ptr =&(matrix[0][0]);

    MPI_Datatype subType;  /* must be declared unconditionally: every rank builds the type below */
    MPI_Datatype type;

    MPI_Type_create_subarray(2, global_size, local_size, start, MPI_ORDER_C, MPI_DOUBLE, &subType);
    MPI_Type_create_resized(subType, 0, local_size[1]*sizeof(double), &type);
    MPI_Type_commit(&type);
    //printf("Global 0 : %d global 1 : %d local 0 : %d local 1 : %d\n",global_size[0],global_size[1],local_size[0], local_size[1]);
    MPI_Gatherv(local_ptr, local_size[0]*local_size[1],MPI_DOUBLE,global_mat_ptr, sendCounts, displs, type,
                0, MPI_COMM_WORLD);

    MPI_Type_free(&type);
}
Example #14
void 
avtWholeImageCompositerWithZ::InitializeMPIStuff(void)
{

#define UCH MPI_UNSIGNED_CHAR
#define FLT MPI_FLOAT
   int                lengths[] = {  1,   1,   1,   1};
   MPI_Aint     displacements[] = {  0,   0,   0,   0};
   MPI_Datatype         types[] = {FLT, UCH, UCH, UCH};
   ZFPixel_t    onePixel;
#undef UCH
#undef FLT

   // create the MPI data type for ZFPixel
   MPI_Address(&onePixel.z, &displacements[0]);
   MPI_Address(&onePixel.r, &displacements[1]);
   MPI_Address(&onePixel.g, &displacements[2]);
   MPI_Address(&onePixel.b, &displacements[3]);

   for (int i = 3; i >= 0; --i)
      displacements[i] -= displacements[0];

   MPI_Type_create_struct(4, lengths, displacements, types,
      &avtWholeImageCompositerWithZ::mpiTypeZFPixel);

   // check that the datatype has the correct extent
   MPI_Aint ext;
   MPI_Type_extent(avtWholeImageCompositerWithZ::mpiTypeZFPixel, &ext);
   if (ext != sizeof(onePixel))
   {
       MPI_Datatype tmp = avtWholeImageCompositerWithZ::mpiTypeZFPixel;
       MPI_Type_create_resized(tmp, 0, sizeof(ZFPixel_t),
           &avtWholeImageCompositerWithZ::mpiTypeZFPixel);
       MPI_Type_free(&tmp);
   }

   MPI_Type_commit(&avtWholeImageCompositerWithZ::mpiTypeZFPixel);

   MPI_Op_create((MPI_User_function *)MergeZFPixelBuffers, 1,
      &avtWholeImageCompositerWithZ::mpiOpMergeZFPixelBuffers);
}
Example #15
/**
 * Returns the MPI_Datatype for `MyStruct`.
 *
 * TODO: You have to implement this function here:
 */
MPI_Datatype mystruct_get_mpi_type() {
    // use MPI commands to create a custom data type for MyStruct
    MPI_Datatype type, tmp_type;

    // TODO: create the MPI datatype for MyStruct
    MyStruct x;
    MPI_Aint base, adr_key, adr_d, adr_e;
    MPI_Get_address(&x, &base);
    MPI_Get_address(&x.key, &adr_key);
    MPI_Get_address(&x.d, &adr_d);
    MPI_Get_address(&x.e[0], &adr_e);

    MPI_Aint     disps[3] = {adr_key - base, adr_d - base, adr_e - base};
    MPI_Aint     extent   = sizeof(x);
    int          blens[3] = {1, 1, 4};
    MPI_Datatype types[3] = {MPI_UNSIGNED, MPI_DOUBLE, MPI_CHAR};

    MPI_Type_create_struct(3, blens, disps, types, &tmp_type);
    MPI_Type_create_resized(tmp_type, 0, extent, &type);

    MPI_Type_commit(&type);
    return type;
}
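For context, a MyStruct definition consistent with the fields and block lengths used above (an assumption; the real definition belongs to the exercise's header):

typedef struct {
    unsigned key;   /* blens[0] = 1, MPI_UNSIGNED */
    double   d;     /* blens[1] = 1, MPI_DOUBLE   */
    char     e[4];  /* blens[2] = 4, MPI_CHAR     */
} MyStruct;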
Example #16
int matrix_placement_proc(int nb_proc_row, int nb_in_block, MPI_Comm* comm, int* sendbuf, int* rcvbuf, enum arrangement type, int ldnblc){
  MPI_Datatype blocktype;
  MPI_Datatype blocktype2; 
  MPI_Datatype blocktype3;
  int ii, jj;


  // first you create the type representation for a matrix bloc associated to a process
  MPI_Type_vector(nb_in_block, nb_in_block, nb_in_block*nb_proc_row, MPI_INT, &blocktype2);
  MPI_Type_create_resized(blocktype2, 0, sizeof(int), &blocktype);
  MPI_Type_commit(&blocktype);

  MPI_Type_vector(nb_in_block, nb_in_block, ldnblc, MPI_INT, &blocktype3);
  MPI_Type_commit(&blocktype3);

  int disps[nb_proc_row*nb_proc_row];
  int counts[nb_proc_row*nb_proc_row];
  for (ii=0; ii<nb_proc_row; ii++) {
    for (jj=0; jj<nb_proc_row; jj++) {
      disps[ii*nb_proc_row+jj] = ii*nb_in_block*nb_in_block*nb_proc_row+jj*nb_in_block;
      counts [ii*nb_proc_row+jj] = 1;
    }
  }

  // scatter or gather
  if (type == SCATTER) 
    MPI_Scatterv(sendbuf, counts, disps, blocktype, rcvbuf, 1, blocktype3, 0, *comm);
  else if (type == GATHER) 
    MPI_Gatherv(sendbuf, 1, blocktype3, rcvbuf, counts, disps, blocktype, 0, *comm);
  
  MPI_Type_free(&blocktype);
  MPI_Type_free(&blocktype3);
  
  if (type != GATHER && type != SCATTER)
    return EXIT_FAILURE;
  return EXIT_SUCCESS;
}
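Assumed supporting declarations for the example above (the real ones live elsewhere in the project):

#include <stdlib.h>                    /* EXIT_SUCCESS / EXIT_FAILURE */
enum arrangement { SCATTER, GATHER };  /* selector passed as `type`   */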
Example #17
/*
 * Construct a sub array type for scatter and gather data
 * Reference: http://stackoverflow.com/questions/9269399/sending-blocks-of-2d-array-in-c-using-mpi/9271753#9271753
 */
void init_subarrtype(int root, int me,
        int n, int dim_sz, int per_n,
        MPI_Datatype* subarrtype_addr, int sendcounts[], int displs[]) {
    int sizes[2]    = {n, n};         /* global size */
    int subsizes[2] = {per_n, per_n}; /* local size */
    int starts[2]   = {0,0};          /* where this one starts */

    MPI_Datatype type;
    mpi_check(MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &type));
    mpi_check(MPI_Type_create_resized(type, 0, per_n*sizeof(double), subarrtype_addr));
    mpi_check(MPI_Type_commit(subarrtype_addr));
    int i,j;
    if(me == root) {
        for (i=0; i< dim_sz*dim_sz; i++) { sendcounts[i] = 1; }
        int disp = 0;
        for (i=0; i<dim_sz; i++) {
            for (j=0; j<dim_sz; j++) {
                displs[i*dim_sz+j] = disp;
                disp += 1;
            }
            disp += (per_n-1)*dim_sz;
        }
    }
}
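A hedged usage sketch (global_A and local_A are assumed names): scatter an n x n matrix into per_n x per_n tiles, one per rank of a dim_sz x dim_sz process grid, using the resized subarray type built above.

void scatter_tiles(double *global_A, double *local_A,
        int root, int n, int dim_sz, int per_n) {
    MPI_Datatype subarrtype;
    int sendcounts[dim_sz*dim_sz], displs[dim_sz*dim_sz];
    int me;

    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    init_subarrtype(root, me, n, dim_sz, per_n, &subarrtype, sendcounts, displs);
    /* sendcounts/displs are only significant at root, as filled by init_subarrtype */
    MPI_Scatterv(global_A, sendcounts, displs, subarrtype,
            local_A, per_n*per_n, MPI_DOUBLE, root, MPI_COMM_WORLD);
    MPI_Type_free(&subarrtype);
}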
Example #18
int main( int argc, char *argv[] )
{
    int errs = 0, i;
    int rank, size, source, dest;
    int count; 
    int *buf; 
    MPI_Comm      comm;
    MPI_Status    status;
    MPI_Datatype  newtype;

    MTest_Init( &argc, &argv );

    comm = MPI_COMM_WORLD;

    /* Determine the sender and receiver */
    MPI_Comm_rank( comm, &rank );
    MPI_Comm_size( comm, &size );
    source = 0;
    dest   = size - 1;

    /* Create a type that is "* INT * ",
       that is, there is an int-sized pad at the beginning of the type, 
       and the extent is still 3 ints.  Note, however, that the INT
       is still at displacement 0, so element i's int lands at offset
       3*i ints in the buffer. */
    MPI_Type_create_resized( MPI_INT, -(int)sizeof(int), 3 * sizeof(int), &newtype ); 
    MPI_Type_commit( &newtype );
    for (count = 1; count < 65000; count = count * 2) {
	buf = (int *)malloc( count * 3 * sizeof(int) );
	if (!buf) {
	    MPI_Abort( comm, 1 );
            exit(1);
	}
	for (i=0; i<3*count; i++) buf[i] = -1;
	if (rank == source) {
	    for (i=0; i<count; i++) buf[3*i] = i;
	    MPI_Send( buf, count, newtype, dest, 0, comm );
	    MPI_Send( buf, count, newtype, dest, 1, comm );
	}
	else if (rank == dest) {
	    MPI_Recv( buf, count, MPI_INT, source, 0, comm, &status );
	    for (i=0; i<count; i++) {
		if (buf[i] != i) {
		    errs++;
		    if (errs < 10) {
			printf( "buf[%d] = %d\n", i, buf[i] );
		    }
		}
	    }
	    for (i=0; i<count*3; i++) buf[i] = -1;
	    MPI_Recv( buf, count, newtype, source, 1, comm, &status );
	    for (i=0; i<count; i++) {
		if (buf[3*i] != i) {
		    errs++;
		    if (errs < 10) {
			printf( "buf[3*%d] = %d\n", i, buf[i] );
		    }
		}
	    }
	}
    }
    MPI_Type_free( &newtype );

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
Example #19
int main ( int argc, char *argv[] ) {

  // Solution arrays
  real *h_u; /* to be allocated in ROOT only */ 
  real *t_u;
  real *t_un;

  // Auxiliary variables
  int rank;
  int size;
  int step;
  dmn domain;
  double wtime;
  int nbrs[6];
  int i, j, k;

  // Initialize MPI
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank); // needed by the early-exit checks below

  // if the number of processes != SX*SY*SZ then terminate. 
  if (size != SX*SY*SZ){
    if (rank==ROOT) 
      fprintf(stderr,"%s: Needs exactly %d processors.\n", argv[0], SX*SY*SZ);
    MPI_Finalize();
    return 1;
  }

  // verify subsizes
  if (NX%SX!=0 || NY%SY!=0 || NZ%SZ!=0) {
    if (rank==ROOT) 
      fprintf(stderr,"%s: Subdomain sizes not an integer value.\n", argv[0]);
    MPI_Finalize();
    return 1;
  }

  // Build a 3D cartesian communicator
  MPI_Comm Comm3d;
  int ndim=3;
  int dim[3]={SZ,SY,SX}; // domain decomposition subdomains
  int period[3]={false,false,false}; // periodic conditions
  int reorder={true}; // allow reorder if necessary
  int coord[3];
  MPI_Cart_create(MPI_COMM_WORLD,ndim,dim,period,reorder,&Comm3d);
  MPI_Comm_rank(Comm3d,&rank); // rank wrt Comm3d
  MPI_Cart_coords(Comm3d,rank,3,coord); // rank coordinates
  
  // Map the neighbours ranks
  MPI_Cart_shift(Comm3d,0,1,&nbrs[TOP],&nbrs[BOTTOM]);
  MPI_Cart_shift(Comm3d,1,1,&nbrs[NORTH],&nbrs[SOUTH]);
  MPI_Cart_shift(Comm3d,2,1,&nbrs[WEST],&nbrs[EAST]);

  // Manage Domain sizes
  domain = Manage_Domain(rank,size,coord,nbrs); 

  // Allocate Memory
  Manage_Memory(0,domain,&h_u,&t_u,&t_un);

  // Root mode: Build Initial Condition 
  if (domain.rank==ROOT) Call_IC(2,h_u);

  // Build MPI data types
  MPI_Datatype myGlobal;
  MPI_Datatype myLocal;
  MPI_Datatype xySlice;
  MPI_Datatype yzSlice;
  MPI_Datatype xzSlice;
  //Manage_DataTypes(0,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal);

  // Build a MPI data type for a subarray in Root processor
  MPI_Datatype global;
  int nx = domain.nx;
  int ny = domain.ny;
  int nz = domain.nz;
  int bigsizes[3] = {NZ,NY,NX};
  int subsizes[3] = {nz,ny,nx};
  int starts[3] = {0,0,0};
  MPI_Type_create_subarray(3, bigsizes, subsizes, starts, MPI_ORDER_C, MPI_CUSTOM_REAL, &global);
  MPI_Type_create_resized(global, 0, nx*sizeof(real), &myGlobal); // extend the type 
  MPI_Type_commit(&myGlobal);
    
  // Build a MPI data type for a subarray in workers
  int bigsizes2[3] = {R+nz+R,R+ny+R,R+nx+R};
  int subsizes2[3] = {nz,ny,nx};
  int starts2[3] = {R,R,R};
  MPI_Type_create_subarray(3, bigsizes2, subsizes2, starts2, MPI_ORDER_C, MPI_CUSTOM_REAL, &myLocal);
  MPI_Type_commit(&myLocal); // now we can use this custom MPI data type

  // halo data types
  MPI_Datatype yVector;
  MPI_Type_vector( ny, nx, nx+2*R, MPI_CUSTOM_REAL, &xySlice); MPI_Type_commit(&xySlice);
  MPI_Type_vector( ny,  1, nx+2*R, MPI_CUSTOM_REAL, &yVector); 
  MPI_Type_create_hvector(nz, 1, (nx+2*R)*(ny+2*R)*sizeof(real), yVector, &yzSlice); MPI_Type_commit(&yzSlice);
  MPI_Type_vector( nz, nx, (nx+2*R)*(ny+2*R), MPI_CUSTOM_REAL, &xzSlice); MPI_Type_commit(&xzSlice);
  
  // build sendcounts and displacements in root processor
  int sendcounts[size], displs[size];
  if (rank==ROOT) {
    for (i=0; i<size; i++) sendcounts[i]=1;
    int disp = 0; // displacement counter
    for (k=0; k<SZ; k++) {
      for (j=0; j<SY; j++) {
	for (i=0; i<SX; i++) {
	  displs[i+SX*j+SX*SY*k]=disp;  disp+=1; // x-displacements
	}
	disp += SX*(ny-1); // y-displacements
      }
      disp += SX*NY*(nz-1); // z-displacements
    } 
  }

  // Scatter global array data and exchange halo regions
  MPI_Scatterv(h_u, sendcounts, displs, myGlobal, t_u, 1, myLocal, ROOT, Comm3d);
  Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u); MPI_Barrier(Comm3d);
   
  // ROOT mode: Record the starting time.
  if (rank==ROOT) wtime=MPI_Wtime();

  // Asynchronous MPI Solver
  for (step = 0; step < NO_STEPS; step+=2) {
    // print iteration in ROOT mode
    if (rank==ROOT && step%10000==0) printf("  Step %d of %d\n",step,(int)NO_STEPS);
    
    // Exchange Boundaries and compute stencil
    Call_Laplace(domain,&t_u,&t_un);Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_un);//1stIter
    Call_Laplace(domain,&t_un,&t_u);Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u );//2ndIter
  }
  
  // ROOT mode: Record the final time.
  if (rank==ROOT) {
    wtime = MPI_Wtime()-wtime; printf ("\n Wall clock elapsed = %f seconds\n\n", wtime );    
  }
  /*
  // CAREFUL: uncomment only for debugging. Print subroutine
  for (int p=0; p<size; p++) {
    if (rank == p) {
      printf("Local process on rank %d is:\n", rank);
      for (k=0; k<nz+2*R; k++) {
	printf("-- layer %d --\n",k);
	for (j=0; j<ny+2*R; j++) {
	  putchar('|');
	  for (i=0; i<nx+2*R; i++) printf("%3.0f ",t_u[i+(nx+2*R)*j+(nx+2*R)*(ny+2*R)*k]);
	  printf("|\n");
	}
	printf("\n");
      }
    }
    MPI_Barrier(Comm3d);
    }*/

  // gather all pieces into the big data array
  MPI_Gatherv(t_u, 1, myLocal, h_u, sendcounts, displs, myGlobal, ROOT, Comm3d);
 
  // save results to file
  //if (rank==0) Print(h_u,NX,NY,NZ);
  if (rank==ROOT) Save_Results(h_u); 

  // Free MPI types
  Manage_DataTypes(1,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal);
  
  // Free Memory
  Manage_Memory(1,domain,&h_u,&t_u,&t_un); 
    
  // finalize MPI
  MPI_Finalize();

  // ROOT mode: Terminate.
  if (rank==ROOT) {
    printf ("HEAT_MPI:\n" );
    printf ("  Normal end of execution.\n\n" );
  }

  return 0;
}
Example #20
File: test14.c Project: caisan/umpi
int main(int argc, char **argv)
{
	if (MPI_Init(&argc, &argv) != MPI_SUCCESS) {
		fprintf(stderr, "MPI initialization failed.\n");
		return 1;
	}
	int rank, size;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	if (size < 2) {
		fprintf(stderr, "cant play this game alone.\n");
		return 1;
	}
	struct {
		float _a[3];
		int _34[2];
		char _0;
		unsigned _6;
		int _b;
		unsigned _7;
		int _9;
		short _5;
		unsigned _8;
		double _c;
		float _12[2];
		char _d;
	} recv[size];
	memset(recv, 0, sizeof(recv));
	MPI_Datatype tmp, recv_type, send_type;
	int recv_blocklengths[] = { 1, 2, 2, 1, 1, 1, 1, 1 };
	MPI_Aint recv_displacements[] = {
		(char *)&recv->_0 - (char *)recv,
		(char *)recv->_12 - (char *)recv,
		(char *)recv->_34 - (char *)recv,
		(char *)&recv->_5 - (char *)recv,
		(char *)&recv->_6 - (char *)recv,
		(char *)&recv->_7 - (char *)recv,
		(char *)&recv->_8 - (char *)recv,
		(char *)&recv->_9 - (char *)recv
	};
	MPI_Datatype recv_types[] = { MPI_CHAR, MPI_FLOAT, MPI_INT, MPI_SHORT, MPI_UNSIGNED, MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT };
	MPI_Type_create_struct(8, recv_blocklengths, recv_displacements, recv_types, &tmp);
	MPI_Type_create_resized(tmp, 0, (char *)(recv+1) - (char *)recv, &recv_type);
	MPI_Type_free(&tmp);
	MPI_Type_commit(&recv_type);
	struct {
		char _0;
		float _12[2];
		int _3;
		float _a;
		int _4;
		short _5;
		char _b[5];
		unsigned _678[3];
		int _9;
		long _c;
	} send;
	send._0 = rank + 0;
	send._12[0] = rank + 1;
	send._12[1] = rank + 2;
	send._3 = rank + 3;
	send._4 = rank + 4;
	send._5 = rank + 5;
	send._678[0] = rank + 6;
	send._678[1] = rank + 7;
	send._678[2] = rank + 8;
	send._9 = rank + 9;
	int send_blocklengths[] = { 1, 2, 1, 1, 1, 3, 1 };
	MPI_Aint send_displacements[] = {
		(char *)&send._0 - (char *)&send,
		(char *)send._12 - (char *)&send,
		(char *)&send._3 - (char *)&send,
		(char *)&send._4 - (char *)&send,
		(char *)&send._5 - (char *)&send,
		(char *)send._678 - (char *)&send,
		(char *)&send._9 - (char *)&send
	};
	MPI_Datatype send_types[] = { MPI_CHAR, MPI_FLOAT, MPI_INT, MPI_INT, MPI_SHORT, MPI_UNSIGNED, MPI_INT };
	MPI_Type_create_struct(7, send_blocklengths, send_displacements, send_types, &tmp);
	MPI_Type_create_resized(tmp, 0, sizeof(send), &send_type);
	MPI_Type_free(&tmp);
	MPI_Type_commit(&send_type);
	if (MPI_Allgather(&send, 1, send_type, recv, 1, recv_type, MPI_COMM_WORLD)) {
		fprintf(stderr, "MPI_Allgather failed.\n");
		MPI_Abort(MPI_COMM_WORLD, 1);
	}
	for (int j = 0; j < size; j++) {
		MPI_Barrier(MPI_COMM_WORLD);
		if (j == rank) {
			fprintf(stderr, "[ %d ] received:", rank);
			for (int i = 0; i < size; i++)
				fprintf(stderr, " (%d %g %g %d %d %d %d %d %d %d)", recv[i]._0, recv[i]._12[0], recv[i]._12[1], recv[i]._34[0], recv[i]._34[1], recv[i]._5, recv[i]._6, recv[i]._7, recv[i]._8, recv[i]._9);
			fprintf(stderr, "\n");
		}
	}
	MPI_Finalize();
	return 0;
}
Example #21
int main(int argc, char **argv)
{
	int rank, size;		// My rank and total # of proc
	int row_rank, col_rank; // My row and column rank
	int coord[2];		// My coords in grid
	int dimension;		// #of dimensions
	int dim[2], period[2], reorder; //variables for grid creation
	int local_N, local_M; // local sizes
	double *Ax, *Bx; // local matrices
	double *Sl, *Sr, *Su, *Sd;
	double hx, hy, hz; // variables 
	double nev = 0.;
	int i_start, i_end, j_start, j_end;
	int iter = 0; // iteration #
	hx = hy = hz = 1;
	i_start = j_start = 0;
	Sr = Sl = Su = Sd = NULL;
	MPI_Comm cart_comm;	// Grid comm
	MPI_Comm col_comm;	// My column comm
	MPI_Comm row_comm;  // My row comm
	MPI_Status status;
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);


	// Forming a grid
	switch (size)
	{
	case 1: // 1x1
		dim[0] = 1; dim[1] = 1;
		dimension = 2;
		break;
	case 2: // 1x2
		dim[0] = 1; dim[1] = 2;
		dimension = 2;
		break;
	case 4: // 2x2 or 4x1 or 1x4
		#ifdef SQUARE
		dim[0] = 2; dim[1] = 2;
		dimension = 2;
		#endif // SQUARE

		#ifndef SQUARE
		dim[0] = 1; dim[1] = 4;
		dimension = 2;
		#endif // ROW	
		break;
	case 8:
		#ifdef SQUARE
		dim[0] = 2; dim[1] = 4;
		dimension = 2;
		#endif // SQUARE

		#ifndef SQUARE
		dim[0] = 1; dim[1] = 8;
		dimension = 2;
		#endif // ROW	
		break;

	case 9:
		#ifdef SQUARE
		dim[0] = 3; dim[1] = 3;
		dimension = 2;
		#endif // SQUARE
		break;
	default:
		printf("Please run with 1, 2, 4 or 8 processes.\n"); fflush(stdout);
		MPI_Abort(MPI_COMM_WORLD, 1);
		break;
	}

	local_N = i_end = N / dim[0];
	local_M = j_end = K / dim[1];

	// No wrap around
	period[0] = 0; period[1] = 0;
	// Reordering ranks in grid comm
	reorder = 1;

	MPI_Cart_create(MPI_COMM_WORLD, dimension, dim, period, reorder, &cart_comm);

	// Get new rank and coords in new cartesian comm 
	MPI_Comm_rank(cart_comm, &rank);
	MPI_Cart_coords(cart_comm, rank, dimension, coord);

	// Create comms for rows and columns
	int var_coord[2];
	// Column comm
	var_coord[0] = 1; var_coord[1] = 0;
	MPI_Cart_sub(cart_comm, var_coord, &col_comm);
	MPI_Comm_rank(col_comm, &col_rank);
	//Row comm
	var_coord[0] = 0; var_coord[1] = 1;
	MPI_Cart_sub(cart_comm, var_coord, &row_comm);
	MPI_Comm_rank(row_comm, &row_rank);

	if (coord[0] == 0 || coord[0] == (dim[0] - 1))
		local_N += 1;
	else
		local_N += 2;

	if (coord[1] == 0 || coord[1] == (dim[1] - 1))
		local_M += 1;
	else
		local_M += 2;

	#ifdef DEBUG
	for (int i = 0; i < size; i++)
	{
		if (rank == i)
			printf("Rank = %d , Col_rank = %d, Row_rank = %d, coordinates are %d %d local N = %d  local M = %d \n ", rank, col_rank, row_rank, coord[0], coord[1], local_N, local_M); fflush(stdout);

	}
	MPI_Barrier(cart_comm);
	#endif // DEBUG

	//Init local A and B
	Ax = new double[local_N*local_M*K];
	Bx = new double[local_N*local_M*K];


	for (int k = 0; k < K; k++)
	{
		for (int i = 0; i < local_N; i++)
		{
			for (int j = 0; j < local_M; j++)
			{
				Ax[local_N*local_M*k + local_M*i + j] = 1;
				Bx[local_N*local_M*k + local_M*i + j] = 1;
			}
		}
	}

	// Create types for shadows
	MPI_Datatype Type_H, Type_lr, Type_ud;//Type_right,Type_down,Type_up;

	MPI_Type_vector(local_N*K, 1, local_M, MPI_DOUBLE, &Type_H);
	MPI_Type_create_resized(Type_H, 0, sizeof(double) * 2, &Type_lr);
	MPI_Type_commit(&Type_lr);

	MPI_Type_vector(K, local_M, local_M*local_N, MPI_DOUBLE, &Type_H);
	MPI_Type_create_resized(Type_H, 0, local_M*sizeof(double) * 2, &Type_ud);
	MPI_Type_commit(&Type_ud);

	//Need to start iterations
	double fx, fy, fz;
	fx = fy = fz = 0.;
	int ijk = 0;

	double start, finish;
	start = MPI_Wtime();

#ifdef EASY
	while (iter < N_ITER)
	{
		for (int k = 1; k < K - 1; k++)
		{
			for (int i = 1; i < local_N - 1; i++)
			{
				for (int j = 1; j < local_M - 1; j++)
				{
					fx = (Ax[local_N*local_M*k + local_M*(i + 1) + j] + Ax[local_N*local_M*k + local_M*(i - 1) + j]) / (hx*hx);
					fy = (Ax[local_N*local_M*k + local_M*i + j + 1] + Ax[local_N*local_M*k + local_M*i + j - 1]) / (hy*hy);
					fz = (Ax[local_N*local_M*(k + 1) + local_M*i + j] + Ax[local_N*local_M*(k - 1) + local_M*i + j]) / (hz*hz);
					Bx[local_N*local_M*k + local_M*i + j] = (fx + fy + fz) / (2 / (hx*hx) + 2 / (hy*hy) + 2 / (hz*hz));
					// Need to comp nev
				}
			}
		}
		for (int k = 1; k < K - 1; k++)
		{
			for (int i = 1; i < local_N - 1; i++)
			{
				for (int j = 1; j < local_M - 1; j++)
				{
					ijk = local_N*local_M*k + local_M*i + j;
					Ax[ijk] = Bx[ijk];
				}
			}
		}
		// Sending and recv slice in row to row + 1
		if (row_rank != dim[1] - 1)
			MPI_Sendrecv(&Bx[local_M - 2], 1, Type_lr, row_rank + 1, 0, &Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status);
		
		// Sending and recv slice in row to row - 1
		if (row_rank != 0)
			MPI_Sendrecv(&Bx[1], 1, Type_lr, row_rank - 1, 0, Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status);
		
		// Sending and recv slice in column to column + 1
		if (col_rank != dim[0] - 1)
			MPI_Sendrecv(&Bx[(local_N - 2)*local_M], 1, Type_ud, col_rank + 1, 0, &Ax[(local_N - 1)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm, &status);

		// Sending and recv slice in column to column - 1
		if (col_rank != 0)
			MPI_Sendrecv(&Bx[local_M], 1, Type_ud, col_rank - 1, 0, Ax, 1, Type_ud, col_rank - 1, 0, col_comm, &status);

		iter++;
	}
				#endif

#ifndef EASY
	while (iter < N_ITER)
	{
		// recv from i-1 and j-1
		if (coord[0] != 0 || coord[1] != 0)
		{ 
			if (col_rank > 0)
			{ 
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Waiting slice from COLUMN - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Recv(Ax, 1, Type_ud, col_rank - 1, 0, col_comm, &status);
				if (row_rank > 0)
				{
				#ifdef DEBUG
					printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

					MPI_Recv(Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status);
				}
			}
			else
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Recv(Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status);
			}
		}

				#ifdef DEBUG
		printf("Rank %d coordinates are %d %d ------- COMPUTING \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

		// computing
		for (int k = 1; k < K - 1; k++)
		{
			for (int i = 1; i < local_N - 1; i++)
			{
				for (int j = 1; j < local_M - 1; j++)
				{
					fx = (Ax[local_N*local_M*k + local_M*(i + 1) + j] + Ax[local_N*local_M*k + local_M*(i - 1) + j]) / (hx*hx);
					fy = (Ax[local_N*local_M*k + local_M*i + j + 1] + Ax[local_N*local_M*k + local_M*i + j - 1]) / (hy*hy);
					fz = (Ax[local_N*local_M*(k + 1) + local_M*i + j] + Ax[local_N*local_M*(k - 1) + local_M*i + j]) / (hz*hz);
					Ax[local_N*local_M*k + local_M*i + j] = (fx + fy + fz) / (2 / (hx*hx) + 2 / (hy*hy) + 2 / (hz*hz));
					// Need to comp nev
				}
			}
		}

		// send to i-1 j-1
		if (coord[0] != 0 || coord[1] != 0)
		{ 
			if (col_rank > 0)
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Sending slice to COLUMN - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Send(&Ax[local_M], 1, Type_ud, col_rank - 1, 0, col_comm);
				if (row_rank > 0)
				{
				#ifdef DEBUG
					printf("Rank %d coordinates are %d %d ------- Sending slice to ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

					MPI_Send(&Ax[1], 1, Type_lr, row_rank - 1, 0, row_comm);
				}
			}
			else
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Sending slice to ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Send(&Ax[1], 1, Type_lr, row_rank - 1, 0, row_comm);
			}
		}


		
		//send and recv from i+1 j+1
		if (coord[0] != dim[0] - 1 || coord[1] != dim[1] - 1)
		{ 
			// Send
			if (col_rank < dim[0] - 1)
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Sending slice to COLUMN + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Send(&Ax[(local_N - 2)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm);
				if (row_rank < dim[1] - 1)
				{
					#ifdef DEBUG
					printf("Rank %d coordinates are %d %d ------- Sending slice to ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
					#endif // DEBUG

					MPI_Send(&Ax[local_M - 2], 1, Type_lr, row_rank + 1, 0, row_comm);
				}
			}
			else
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Sending slice to ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Send(&Ax[local_M - 2], 1, Type_lr, row_rank + 1, 0, row_comm);
			}
			// Recv
			if (col_rank < dim[0] - 1)
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Waiting slice from COLUMN + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Recv(&Ax[(local_N - 1)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm, &status);
				if (row_rank < dim[1] - 1)
				{
				#ifdef DEBUG
					printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

					MPI_Recv(&Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status);
				}
			}
			else
			{
				#ifdef DEBUG
				printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout);
				#endif // DEBUG

				MPI_Recv(&Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status);
			}
		}	
		//if (rank == 0)
			//printf("Iter %d done \n ",iter); fflush(stdout);
		iter++;
	}
				#endif

	finish = MPI_Wtime();

	double loc_comp_time = finish - start;
	double max_time;
	MPI_Allreduce(&loc_comp_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

	if (rank == 0)
		printf("NxMxK = %dx%dx%d \nGrid - %dx%d \nLongest time %g\n ",N,M,K,dim[0],dim[1], max_time); fflush(stdout);
	


	MPI_Finalize();
	return 0;
}
Example #22
GrayScott::GrayScott(int N, double rmin, double rmax, double dt, double Du, double Dv, double F, double k, int nSteps, std::string pngname, world_info w, bool localtranspose, unsigned int nthreads)
    : N_(N)
    , Ntot_(N*N)
//    , L_(L)
    , dx_((double) (rmax-rmin) / (double) (N-1))
    , dt_(dt)//(dx_*dx_ / (2.*std::max(Du,Dv)))
    , nSteps_(nSteps)
    , currStep_(0)
    , Du_(Du)
    , Dv_(Dv)
    , uCoeff(Du_*dt_/(2.*dx_*dx_))
    , vCoeff(Dv_*dt_/(2.*dx_*dx_))
    , F_(F)
    , k_(k)
    , matU1_(N, -Du*dt/(2.*dx_*dx_), 1.+Du*dt/(dx_*dx_), -Du*dt/(2.*dx_*dx_))
//    , matU2_(N, -Du*dt/(2.*dx_*dx_), 1.+Du*dt/(dx_*dx_), -Du*dt/(2.*dx_*dx_)) // equal to matU1_, since we have a square grid (dx==dy)
    , matV1_(N, -Dv*dt/(2.*dx_*dx_), 1.+Dv*dt/(dx_*dx_), -Dv*dt/(2.*dx_*dx_))
//    , matV2_(N, -Dv*dt/(2.*dx_*dx_), 1.+Dv*dt/(dx_*dx_), -Dv*dt/(2.*dx_*dx_))
    , pngName_(pngname)
    , world(w)
    , rmin_(rmin)
    , rmax_(rmax)
    , localtranspose_(localtranspose)
    , nthreads_(nthreads)
{
    if (world.rank == 0) {
        // create directory to save output to
        time_t rawtime;
        struct tm * timeinfo;
        char buffer[80];
        time (&rawtime);
        timeinfo = localtime(&rawtime);
        strftime(buffer,80,"%d-%m-%Y_%H-%M-%S",timeinfo);
        std::string timeString(buffer);
	
        dirPath_ = "data/" + timeString + "/";
	
        boost::filesystem::path dir(dirPath_);
//        boost::filesystem::create_directory(dir);
    }
    
    
    
    
    // global grid
    Nx_glo = N;
    Ny_glo = N;
    NN_glo = Nx_glo * Ny_glo;
    
    // local grid
    Nx_loc = Nx_glo / world.dims_x;
    Ny_loc = Ny_glo / world.dims_y;
    NN_loc = Nx_loc * Ny_loc;
    
    Nb_loc = Ny_loc/Nx_loc;
    
    
    
    // build process geometry with cartesian communicator
    int periods[2] = {false, false};
    int dims[2] = {world.dims_x, world.dims_y};
    
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, true, &cart_comm);
    
    MPI_Comm_rank(cart_comm, &world.cart_rank);
    
    MPI_Cart_shift(cart_comm, 0, 1, &world.top_proc, &world.bottom_proc);
    
    int coords[2];
    MPI_Cart_coords(cart_comm, world.cart_rank, 2, coords);
    
    world.coord_x = coords[0];
    world.coord_y = coords[1];
    
    
    
    // datatypes
    
    // build contiguous (rows) vectors for boundaries.
    // each process has multiple rows in the grid
    MPI_Type_contiguous(Ny_loc, MPI_DOUBLE, &bottom_boundary);
    MPI_Type_commit(&bottom_boundary);
    
    MPI_Type_contiguous(Ny_loc, MPI_DOUBLE, &top_boundary);
    MPI_Type_commit(&top_boundary);
    
    
    // build datatypes for transpose
    MPI_Datatype block_send, block_col, block_recv;
    
    // send datatype
    int sizes[2]    = {Nx_loc, Ny_loc}; // size of global array
    int subsizes[2] = {Nx_loc, Nx_loc}; // size of sub-region (square)
    int starts[2]   = {0,0};            // where does the first subarray begin (which index)
    
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &block_send);
    MPI_Type_commit(&block_send);

    // resize -> make contiguous
    MPI_Type_create_resized(block_send, 0, Nx_loc*sizeof(double), &block_resized_send);
    MPI_Type_free(&block_send);
    MPI_Type_commit(&block_resized_send);
    
    
    // receive datatype
    MPI_Type_vector(Nx_loc, 1, Ny_loc, MPI_DOUBLE, &block_col);
    MPI_Type_commit(&block_col);
    MPI_Type_create_hvector(Nx_loc, 1, sizeof(double), block_col, &block_recv);
    MPI_Type_free(&block_col);
    MPI_Type_commit(&block_recv);

    // resize data structure, so that it is contiguous (for alltoall)
    MPI_Type_create_resized(block_recv, 0, 1*sizeof(double), &block_resized_recv);
    MPI_Type_free(&block_recv);
    MPI_Type_commit(&block_resized_recv);
    
    
    
    // sub-domain boundaries
    xmin_loc = rmin + world.coord_x * Nx_loc * dx_;
    xmax_loc = xmin_loc + (Nx_loc - 1) * dx_;
    ymin_loc = rmin + world.coord_y * Ny_loc * dx_;
    ymax_loc = ymin_loc + (Ny_loc - 1) * dx_;
    
    
    
    initialize_fields();
    
    MPI_Barrier(MPI_COMM_WORLD);
}
Example #23
void convolution(int my_id, int p){
    int i, j, k;
    int chunkSize;
    double start, end;
    double time[14];

    /* Input data */
    float input_1[N][N], input_2[N][N];
    /* Output data */
    float output[N][N];
    /* Set the chunk size for each processor */
    chunkSize = N/p;

    /* Two arrays storing the local data distributed by rank 0 */
    float local_data1[N][N], local_data2[N][N];
    /* Local matrix for matrix multiplication */
    float local_data3[chunkSize][N];
    /* A complex array storing the temp row to operate FFT */
    complex temp_data[N];

    /* Initialization of the original Matrix and distribution of data */
    if(my_id == 0){
        printf("2D convolution using SPMD model and MPI Collective operations\n");
        start = MPI_Wtime();
        /*Read data from the files*/
        readFile(input_1, input_2);

        time[0] = MPI_Wtime();
        printf("Reading file takes %f s.\n", time[0] - start);
    }

    /* Scatter all the data to local data */
    MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT,
                local_data1, chunkSize*N, MPI_FLOAT,
                0, MPI_COMM_WORLD);
    MPI_Scatter(input_2, chunkSize*N, MPI_FLOAT,
                local_data2, chunkSize*N, MPI_FLOAT,
                0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    /* Compute time for distributing data */
    if(my_id == 0){
        time[1] = MPI_Wtime();
        printf("Scattering data of rows to each processor takes %f s.\n", time[1] - time[0]);
    }

    /* Row FFT */
    for(i = 0; i < chunkSize; i++){
        for(j = 0; j < N; j++){
            /* FFT each row for im1 */
            temp_data[j].r = local_data1[i][j];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, -1);

        for(j = 0; j < N; j++)
            local_data1[i][j] = temp_data[j].r;

        for(j = 0; j < N; j++){
            /* FFT each row for im2 */
            temp_data[j].r = local_data2[i][j];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, -1);

        for(j = 0; j < N; j++)
            local_data2[i][j] = temp_data[j].r;
    }

    /* Gather all the data and distribute in columns */
    if(my_id == 0){
        time[2] = MPI_Wtime();
        printf("FFT each row for input im1 and im2 takes %f s.\n", time[2] - time[1]);
    }

    MPI_Gather(local_data1, chunkSize*N, MPI_FLOAT,
               input_1, chunkSize*N, MPI_FLOAT,
               0, MPI_COMM_WORLD);
    MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT,
               input_2, chunkSize*N, MPI_FLOAT,
               0, MPI_COMM_WORLD);

    if(my_id == 0){
        time[3] = MPI_Wtime();
        printf("Gathering all the data from different rows takes %f s.\n", time[3] - time[2]);
    }

    /* Initialize a new vector for distributing columns */
    MPI_Datatype column, col;
    /* Column vector */
    MPI_Type_vector(N, 1, N, MPI_FLOAT, &col);
    MPI_Type_commit(&col);
    MPI_Type_create_resized(col, 0, 1*sizeof(float), &column);
    MPI_Type_commit(&column);

    /* Scatter all the data to column local data */
    MPI_Scatter(input_1, chunkSize, column,
                local_data1, chunkSize, column,
                0, MPI_COMM_WORLD);
    MPI_Scatter(input_2, chunkSize, column,
                local_data2, chunkSize, column,
                0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    if(my_id == 0){
        time[4] = MPI_Wtime();
        printf("Scattering data of columns to each processor takes %f s.\n", time[4] - time[3]);
    }
    /* Column FFT */
    for(i = 0; i < chunkSize; i++){
        for(j = 0; j < N; j++){
            /* FFT each column for im1 */
            temp_data[j].r = local_data1[j][i];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, -1);

        for(j = 0; j < N; j++)
            local_data1[j][i] = temp_data[j].r;

        for(j = 0; j < N; j++){
            /* FFT each column for im2 */
            temp_data[j].r = local_data2[j][i];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, -1);

        for(j = 0; j < N; j++)
            local_data2[j][i] = temp_data[j].r;
    }
    /* Gather all the columns from each rank */
    if(my_id == 0){
        time[5] = MPI_Wtime();
        printf("FFT each column for input im1 and im2 takes %f s.\n", time[5] - time[4]);
    }

    MPI_Gather(local_data1, chunkSize, column,
               input_1, chunkSize, column,
               0, MPI_COMM_WORLD);
    MPI_Gather(local_data2, chunkSize, column,
               input_2, chunkSize, column,
               0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Compute time and distribute data to do matrix multiplication */
    if(my_id == 0){
        time[6] = MPI_Wtime();
        printf("Gathering all the data from different columns takes %f s.\n", time[6] - time[5]);
    }

    MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT,
                local_data1, chunkSize*N, MPI_FLOAT,
                0, MPI_COMM_WORLD);
    /* Broadcast data2 to all the ranks */
    MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    if(my_id == 0){
        time[7] = MPI_Wtime();
        printf("Scattering data for multiplication takes %f s.\n", time[7] - time[6]);
    }

    /* Matrix multiplication; clear the accumulator first */
    for(i = 0; i < chunkSize; i++)
        for(j = 0; j < N; j++)
            local_data3[i][j] = 0;
    for(i = 0; i < chunkSize; i++)
        for(j = 0; j < N; j++)
            for(k = 0; k < N; k++)
                local_data3[i][j] += local_data1[i][k]*input_2[k][j];

    /* Collect multiplication results from each rank */
    if(my_id == 0){
        time[8] = MPI_Wtime();
        printf("Matrix multiplication takes %f s.\n", time[8] - time[7]);
    }

    /* Inverse-2DFFT(row) for the output file */
    for(i = 0; i < chunkSize; i++){
        for(j = 0; j < N; j++){
            /* FFT each row for im1 */
            temp_data[j].r = local_data3[i][j];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, 1);

        for(j = 0; j < N; j++)
            local_data3[i][j] = temp_data[j].r;
    }

    if(my_id == 0){
        time[9] = MPI_Wtime();
        printf("Inverse-2DFFT for out_1(row) takes %f s.\n", time[9] - time[8]);
    }

    MPI_Gather(local_data3, chunkSize*N, MPI_FLOAT,
               output, chunkSize*N, MPI_FLOAT,
               0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);

    if(my_id == 0){
        time[10] = MPI_Wtime();
        printf("Gathering all the data of Inverse-2DFFT for out_1(row) takes %f s.\n", time[10] - time[9]);
    }

    MPI_Scatter(output, chunkSize, column,
                local_data1, chunkSize, column,
                0, MPI_COMM_WORLD);

    if(my_id == 0){
        time[11] = MPI_Wtime();
        printf("Scattering out_1(column) to each processor takes %f s.\n", time[11] - time[10]);
    }

    /* Inverse-2DFFT(column) for the output file */
    for(i = 0; i < chunkSize; i++){
        for(j = 0; j < N; j++){
            /* FFT each column for im1 */
            temp_data[j].r = local_data1[j][i];
            temp_data[j].i = 0;
        }

        c_fft1d(temp_data, N, 1);

        for(j = 0; j < N; j++)
            local_data1[j][i] = temp_data[j].r;
    }

    /* Gathering all the columns of the output file from each rank */
    if(my_id == 0){
        time[12] = MPI_Wtime();
        printf("Inverse-2DFFT out_1(column) takes %f s.\n", time[12] - time[11]);
    }

    MPI_Gather(local_data1, chunkSize, column,
               output, chunkSize, column,
               0, MPI_COMM_WORLD);

    if(my_id == 0){
        time[13] = MPI_Wtime();
        printf("Gathering all the data of the output file(column) takes %f s.\n", time[13] - time[12]);

        writeFile(output);

        end = MPI_Wtime();
        printf("Writing the output file to file takes %f s.\n", end - time[13]);

        printf("Total communication time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[12] + time[11] - time[10] + time[7] - time[5] + time[4] - time[2] + time[1] - time[0]);
		printf("Total computing time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[12] - time[11] + time[10] - time[7] + time[5] - time[4] + time[2] - time[1]);
		printf("Total running time without loading/writing of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[0]);
		printf("Total running time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", end - start);
    }

    /* Free vector column */
    MPI_Type_free(&column);
    MPI_Type_free(&col);
}
Example #24
int main(int argc, char **argv) {

    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    

    int a[ROWS_A*COLS_A];
    int B[ROWS_B*COLS_B];
    int C[ROWS_C*COLS_C];

    const int BLOCKROWS = ROWS_C;  /* number of rows in _block_ */
    const int BLOCKCOLS = COLS_C/p; /* number of cols in _block_ */ /* May leave some work uncovered if p does not divide evenly. Two ways to handle it: let one process compute more, or recompute with one process fewer so the split is even. */
    const int NPROWS=1;  /* number of rows in _decomposition_ */ /* with our approach, each process produces one row */
    const int NPCOLS=p;  /* number of cols in _decomposition_ */ /* with our approach, the number of columns equals the number of processes */

    if (rank == 0) {
    	/* Fill the input matrices */
        for (int ii=0; ii<ROWS_A*COLS_A; ii++) {
            a[ii] = ii;
        }

        for (int ii=0; ii<ROWS_B*COLS_B; ii++) {
            B[ii] = ii;
        }
    }

	/* Check that the matrix dimensions are compatible for multiplication */
    if (COLS_A != ROWS_B) {
        fprintf(stderr,"Error: matrix dimensions do not match: %d != %d\n", COLS_A, ROWS_B);
        MPI_Finalize();
        exit(-1);
    }

    /* Check that the number of processes matches the decomposition */
    if (p != NPROWS*NPCOLS) {
        fprintf(stderr,"Error: number of PEs %d != %d x %d\n", p, NPROWS, NPCOLS);
        MPI_Finalize();
        exit(-1);
    }

    /* Per-process buffer where the received data is stored */
    int r[BLOCKROWS*BLOCKCOLS]; //3 x ( 3 / p )
    for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++) r[ii] = 0; //initialize the vector used by each process

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;

    //First vector, for A
    MPI_Type_vector(
    	COLS_A, //number of elements, corresponding to the row size
    	ROWS_A/p, //size of each one
    	COLS_A, //offset, displacement to the next one
    	MPI_INT, 
    	&blocktype2);

    MPI_Type_create_resized( //No explanation for now; Daniel will cover it later
    	blocktype2, 
    	0, 
    	sizeof(int), 
    	&blocktype);

    MPI_Type_commit(&blocktype);

    int disps[NPROWS*NPCOLS]; //We lay them out in a single row
    int counts[NPROWS*NPCOLS]; 
    for (int ii=0; ii<NPROWS; ii++) {
        for (int jj=0; jj<NPCOLS; jj++) {
            disps[ii*NPCOLS+jj] = ii*COLS_A*BLOCKROWS+jj*BLOCKCOLS; //Same as in the slides, all the same size
            counts [ii*NPCOLS+jj] = 1; //This can get more involved if boundary conditions are added for some processes
        }
    }

    MPI_Scatterv(
    	a, //source matrix
    	counts,  //number of elements for each process
    	disps, //displacements
    	blocktype, 
    	r, //where the received data is stored
    	BLOCKROWS*BLOCKCOLS, //size of the receive vector r
    	MPI_INT, 
    	0, //root process
    	MPI_COMM_WORLD);

    /* each proc prints its "r" out, in order */ //Done this way so the output comes out in order: each process prints only on its turn. Not needed when ordering does not matter.
    for (int proc=0; proc<p; proc++) {
        if (proc == rank) {
            printf("Rank = %d\n", rank);
            if (rank == 0) {
                printf("Global matrix: \n");
                for (int ii=0; ii<ROWS_A; ii++) {
                    for (int jj=0; jj<COLS_A; jj++) {
                        printf("%3d ",(int)a[ii*COLS_A+jj]);
                    }
                    printf("\n");
                }
            }
            printf("Local Matrix:\n");
            for (int ii=0; ii<BLOCKROWS; ii++) {
                for (int jj=0; jj<BLOCKCOLS; jj++) {
                    printf("%3d ",(int)b[ii*BLOCKCOLS+jj]);
                }
                printf("\n");
            }
            printf("\n");
        }
        MPI_Barrier(MPI_COMM_WORLD); //all processes must reach this point before continuing
    }

    if (rank == 0) {
		memset(a, 0, ROWS_A*COLS_A*sizeof(int)); //Clear the original matrix to 0
        printf("Global matrix again: \n");
                for (int ii=0; ii<ROWS_A; ii++) {
                    for (int jj=0; jj<COLS_A; jj++) {
                        printf("%3d ",(int)a[ii*COLS_A+jj]);
                    }
                    printf("\n");
                }
         
    }

    MPI_Gatherv(r, BLOCKROWS*BLOCKCOLS, MPI_INT, a, counts, disps, blocktype, 0, MPI_COMM_WORLD);

        if (rank == 0) {
                printf("Global matrix again: \n");
                for (int ii=0; ii<ROWS_A; ii++) {
                    for (int jj=0; jj<COLS_A; jj++) {
                        printf("%3d ",(int)a[ii*COLS_A+jj]);
                    }
                    printf("\n");
                }
            }
     

    MPI_Finalize();

    return 0;
}
Example #25
int main(int argc, char *argv[])
{
	//initialize
	int i, j, k, l, mpi_rank, mpi_size, mpi_rowsize, mpi_colsize, subN, sqrtP;
	int row_rank, col_rank, row, col, destR, destC, src, srcR, srcC;
	// declare variables to store time of parallelism
	double execTime, execStart, execEnd;

	MPI_Comm rowComm, colComm;
	
	MPI_Init(&argc, &argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

	if (argc < 3) {
		printf("error: need two filename arguments for the input matrices\n");
		exit(-1);
	}

	// declare variables of type vector to read matrices
	vector * mat1;
	vector * mat2;
	float * vector1;
	float * vector2;
	//create data on root
	// read matrices 
	if (mpi_rank == ROOT)
	{
		mat1 = readfile(argv[1], N, N);
		mat2 = readfile(argv[2], N, N);
		vector1 = mat1->data;
		vector2 = mat2->data;
	}
	// find the sub-matrix dimension
	sqrtP = (int) sqrt(mpi_size);
	subN = N/sqrtP;

	//pointers for the subN x subN local blocks and the N x N gathered result
	float * row_mat, *col_mat, *row_mat_rec, *col_mat_rec, *col_matT;
	double *can_res, *can_out;

	//allocate memory for the buffers
	row_mat = allocateFloatMatrix(subN, subN);
	row_mat_rec = allocateFloatMatrix(subN, subN);
	col_mat = allocateFloatMatrix(subN, subN);
	col_matT = allocateFloatMatrix(subN, subN);
	col_mat_rec = allocateFloatMatrix(subN, subN);
	can_res = allocateDoubleMatrix(subN, subN);
	can_out = allocateDoubleMatrix(N, N);

	//create and commit datatypes
	MPI_Datatype arrtype, resized_arrtype, arrtypeD, resized_arrtypeD;

	int sizes[2] = { N,N };
	int subsizes[2] = { subN,subN };
	int starts[2] = { 0,0 };

	MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &arrtype);
	MPI_Type_create_resized(arrtype, 0, subN * sizeof(float), &resized_arrtype);
	MPI_Type_commit(&resized_arrtype);
	
	MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &arrtypeD);
	MPI_Type_create_resized(arrtypeD, 0, subN * sizeof(double), &resized_arrtypeD);
	MPI_Type_commit(&resized_arrtypeD);

	//calculate send counts and displacements
	int * counts, * displs;
	counts = (int *) malloc(mpi_size * sizeof(int));
	displs = (int *) malloc(mpi_size * sizeof(int));

	for(i = 0; i < mpi_size; i++)
	{		
		counts[i] = 1;
		displs[i] = N*(i/sqrtP) + (i%sqrtP);
	}	
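	// Note: resized_arrtype's extent was shrunk to subN floats, so these
	// displacements are counted in units of subN floats; the block at grid
	// position (i/sqrtP, i%sqrtP) starts N*(i/sqrtP) + (i%sqrtP) such units
	// into the matrix.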
	
	//start timer; the matrix multiplication below is timed and reported at the end
	execStart = MPI_Wtime();
	
	//scatterv subarrays
	MPI_Scatterv(vector1, counts, displs, resized_arrtype, row_mat, subN*subN, MPI_FLOAT, ROOT, MPI_COMM_WORLD);
	MPI_Scatterv(vector2, counts, displs, resized_arrtype, col_mat, subN*subN, MPI_FLOAT, ROOT, MPI_COMM_WORLD);
	
	//get row comm and rank
	row = mpi_rank/sqrtP;
	MPI_Comm_split(MPI_COMM_WORLD,row,mpi_rank,&rowComm);
	MPI_Comm_rank(rowComm,&row_rank);
	MPI_Comm_size(rowComm, &mpi_rowsize);
	//get col comm and rank
	col = mpi_rank%sqrtP;
	MPI_Comm_split(MPI_COMM_WORLD,col,mpi_rank,&colComm);
	MPI_Comm_rank(colComm,&col_rank);
	MPI_Comm_size(colComm, &mpi_colsize);
	
	//MPI_Barrier(MPI_COMM_WORLD);
	
	// find the source and destination in row communicator - to left shift rows by row number
	destR = row_rank-row;
    if (destR < 0) {
        destR = destR + mpi_rowsize;
    }
	
	srcR = row_rank+row;
	if (srcR > (mpi_rowsize-1)) {
        srcR = srcR - mpi_rowsize;
    }
	
	// find the source and destination in column communicator  - to north shift columns by column number
	destC = col_rank - col;
    if (destC < 0) {
        destC = destC + mpi_colsize;
    }
	srcC = col_rank+col;
	if (srcC > (mpi_colsize-1)) {
        srcC = srcC - mpi_colsize;
    }
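	// Initial alignment, as in Cannon's algorithm: block row i is shifted left
	// by i positions and block column j is shifted up by j positions, so that
	// matching sub-blocks of the two matrices meet on the same process.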
	// left shift rows by row number
	MPI_Sendrecv(row_mat, subN*subN, MPI_FLOAT, destR, 0, row_mat_rec, subN*subN, MPI_FLOAT, srcR, MPI_ANY_TAG, rowComm, MPI_STATUS_IGNORE);

	//north shift columns by column number
	MPI_Sendrecv(col_mat, subN*subN, MPI_FLOAT, destC, 1, col_mat_rec, subN*subN, MPI_FLOAT, srcC, MPI_ANY_TAG, colComm, MPI_STATUS_IGNORE);

	 
	
	for (l=0; l<sqrtP; l++)
	{
		memcpy(row_mat, row_mat_rec, sizeof(float)*subN*subN);
		memcpy(col_mat, col_mat_rec, sizeof(float)*subN*subN);
		
		// Finding the transpose of matrix B
		matrixTranspose(subN, col_mat, col_matT);
		//perform a partial matrix-vector multiplication on each process
		matrixMultiplyT(subN, row_mat, col_matT, can_res);
		
		// find the source and destination in row communicator  - to left shift all rows once
	    if (row_rank != 0) {
			destR = row_rank - 1;
		} else {
			destR = mpi_rowsize - 1;
		}
		
	    srcR = row_rank + 1;
		if (srcR == mpi_rowsize) {
			srcR = 0;
		}
		// find the source and destination in column communicator  - to north shift all columns once
		if (col_rank != 0) {
		destC = col_rank - 1;
		} else {
			destC = mpi_colsize - 1;
		}
		
	    srcC = col_rank + 1;
		if (srcC == mpi_colsize) {
			srcC = 0;
		}
		
		//left shift all rows once
		MPI_Sendrecv(row_mat, subN*subN, MPI_FLOAT, destR, 2, row_mat_rec, subN*subN, MPI_FLOAT, srcR, MPI_ANY_TAG, rowComm, MPI_STATUS_IGNORE);
		
		//north shift all columns once
		MPI_Sendrecv(col_mat, subN*subN, MPI_FLOAT, destC, 3, col_mat_rec, subN*subN, MPI_FLOAT, srcC, MPI_ANY_TAG, colComm, MPI_STATUS_IGNORE);
	}

	// gather the matrix multiplication results from all procs
	MPI_Gatherv(can_res, subN*subN, MPI_DOUBLE, can_out, counts, displs, resized_arrtypeD, ROOT, MPI_COMM_WORLD);
	
		//stop timer
	execEnd = MPI_Wtime();
	execTime = execEnd - execStart;

    //free datatypes
	MPI_Type_free(&resized_arrtype);
	MPI_Type_free(&resized_arrtypeD);
	if (mpi_rank == ROOT)
	{
		printf("Execution time for dot product: %f seconds\n", execTime);
		printf("Result: %f, %f, %f \n ", can_out[0], can_out[2047*N + 2047], can_out[4095*N + 4095]);
		free(vector1);
		free(vector2);
	}
	free(row_mat);
	free(col_mat);
	free(row_mat_rec);
	free(col_mat_rec);
	free(col_matT);
	free(can_res);
	free(can_out);

	//shut down MPI
	MPI_Finalize();

	return 0;
}
Example #26
0
void step4(inst i, int r, int s)
{
    inst instance = i;
    int rank = r;
    int size = s;

    // Creation of the 2D torus we will then use
    MPI_Comm comm;
    int dim[2] = {instance.p, instance.q};
    int period[2] = {1, 1};
    int reorder = 0;
    int coord[2];
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &comm);
    MPI_Cart_coords(comm, rank, 2, coord);


    grid global_grid;

    char type = 0;
    MPI_File input_file;

    // We start by reading the header of the file
    MPI_File_open(comm, instance.input_path, MPI_MODE_RDONLY, MPI_INFO_NULL, &input_file);
    MPI_File_read_all(input_file, &type, 1, MPI_CHAR, MPI_STATUS_IGNORE);

    if(type == 1)
    {
	if (rank == 0) fprintf(stderr, "Error: type 1 files are not supported in step 4\n");
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Finalize();
	exit(EXIT_FAILURE);
    }
	
    // we needed to swap the next 2 lines
    MPI_File_read_all(input_file, &(global_grid.n), 1, MPI_UINT64_T, MPI_STATUS_IGNORE);
    MPI_File_read_all(input_file, &(global_grid.m), 1, MPI_UINT64_T, MPI_STATUS_IGNORE);

#ifdef DEBUG
    if(rank == 0)
	printf("n, m = %zu %zu\n", global_grid.n, global_grid.m);
#endif


    if(!(global_grid.n % instance.p == 0 && global_grid.m % instance.q == 0))
    {
	if(rank == 0)
	    fprintf(stderr, "Error: please choose the grid parameters so they divide the grid of the cellular automaton. For example %zu %zu, but you need to move from %d procs to %zu\n", instance.p + (global_grid.n % instance.p), instance.q + (global_grid.m % instance.q), size, (instance.p + (global_grid.n % instance.p))*(instance.q + (global_grid.m % instance.q)));
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Finalize();
	exit(EXIT_FAILURE);
    }

    size_t local_nrows = global_grid.n/instance.p;
    size_t local_ncols = global_grid.m/instance.q;
    
    // Now we create the data structures.
    int blocks[2] = {1, 2};
    MPI_Datatype types[2] = {MPI_BYTE, MPI_DOUBLE};
    MPI_Aint a_size = sizeof(cell2);
    MPI_Aint a_disp[3] = {offsetof(cell2, type), offsetof(cell2, u), offsetof(cell2, s)};

    MPI_Aint p_size = 17;
    MPI_Aint p_disp[3] = {0, 1, 9};

    MPI_Datatype p_tmp, a_tmp, p_cell, a_cell;

    // Aligned struct, memory representation
    MPI_Type_create_struct(2, blocks, a_disp, types, &a_tmp);
    MPI_Type_create_resized(a_tmp, 0, a_size, &a_cell);
    MPI_Type_commit(&a_cell);
	    
    // Packed struct, file-based representation
    MPI_Type_create_struct(2, blocks, p_disp, types, &p_tmp);
    MPI_Type_create_resized(p_tmp, 0, p_size, &p_cell);
    MPI_Type_commit(&p_cell);
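    /* Optional sanity check (a sketch, not part of the original code): both cell
       types carry the same 1 + 2*8 = 17 bytes of payload; only their extents
       differ (sizeof(cell2) with padding vs. the packed 17-byte file record). */
    {
	int a_sz, p_sz;
	MPI_Type_size(a_cell, &a_sz);
	MPI_Type_size(p_cell, &p_sz);
	if (rank == 0 && a_sz != p_sz)
	    fprintf(stderr, "cell datatype size mismatch: %d vs %d\n", a_sz, p_sz);
    }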

    // Now, we create our matrix
    MPI_Datatype matrix;
    int sizes[2] = {global_grid.n, global_grid.m};
    int subsizes[2] = {local_nrows, local_ncols};
    int starts[2] = {0, 0};
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, p_cell, &matrix);
    MPI_Type_commit(&matrix);

    // We extend this matrix
    MPI_Datatype ematrix;
    int e_subsizes[2] = {2 + subsizes[0], 2 + subsizes[1]};
    int e_start[2] = {1, 1};
    MPI_Type_create_subarray(2, e_subsizes, subsizes, e_start, MPI_ORDER_C, a_cell, &ematrix);
    MPI_Type_commit(&ematrix);
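    // ematrix places the local_nrows x local_ncols block at offset (1,1) inside a
    // (local_nrows+2) x (local_ncols+2) buffer, i.e. the interior of the
    // ghost-padded local array, so one collective read fills the interior directly.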
	

    // The next 3 types are for the export of the grid
    MPI_Datatype d_type;
    MPI_Type_create_resized(MPI_DOUBLE, 0, sizeof(cell2), &d_type);
    MPI_Type_commit(&d_type);
	

    MPI_Datatype d_matrix;
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &d_matrix);
    MPI_Type_commit(&d_matrix);

    MPI_Datatype d_rmatrix; // to go from the extended matrix with ghost zones to the other one
    MPI_Type_create_subarray(2, e_subsizes, subsizes, e_start, MPI_ORDER_C, d_type, &d_rmatrix);
    MPI_Type_commit(&d_rmatrix);



    // Set file view for each element
    MPI_Offset grid_start;
    MPI_File_get_position(input_file, &grid_start);

	
    MPI_File_set_view(input_file, grid_start + global_grid.m*local_nrows*p_size*coord[0] + local_ncols*p_size*coord[1], p_cell, matrix, "native", MPI_INFO_NULL);

    // allocate the cell array we will use
    cell2 **cells;
    cells = malloc(2*sizeof(cell2 *));
    double *sensors;
	
    cells[1] = calloc((2+local_nrows)*(2+local_ncols),sizeof(cell2));
    cells[0] = calloc((2+local_nrows)*(2+local_ncols),sizeof(cell2));
    sensors = calloc(local_nrows*local_ncols, sizeof(double));
	
    MPI_File_read_all(input_file, cells[0], 1, ematrix, MPI_STATUS_IGNORE);

    MPI_File_close(&input_file);

#ifdef DEBUG
    for(size_t i = 1; i < 1+local_nrows; i++)
	for(size_t j = 1; j < 1+local_ncols; j++)
	    fprintf(stderr, "%d - %d %f\n", rank, cells[0][i*(2+local_ncols)+j].type, cells[0][i*(2+local_ncols)+j].u);
#endif

    MPI_Datatype l_row; // local row
    MPI_Type_contiguous(local_ncols, d_type, &l_row);
    MPI_Type_commit(&l_row);

    MPI_Datatype l_col; // local column. A bit trickier, we need a type_vector.
    MPI_Type_vector(local_nrows, 1, local_ncols+2, d_type, &l_col);
    MPI_Type_commit(&l_col);
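    // Ghost-exchange types: l_row picks the u field of local_ncols consecutive
    // cells of one row (d_type has the extent of a whole cell2), while l_col
    // picks one u field per row, striding over padded rows of local_ncols+2 cells.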

	
    int top, bot, left, right;
    double sqspeed = 0;

    int curr = 0, next = 0;
    char *alldump = malloc(256);

    for(int s = 0; s < instance.iteration; s++)
    {
	// We will update cell[next], and use the data of cell[curr]
	curr = s % 2;
	next = (s+1) % 2;
	    
	// We copy the edges of the grid.
	// We first need the ranks of the neighbours

	MPI_Cart_shift(comm, 0, 1, &top, &bot);
	MPI_Cart_shift(comm, 1, 1, &left, &right);
	    

	// Then we need to update the edges of our local grid
	// Update top and bottom rows
	MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+1].u),               1, l_row, top, 0,
		     &(cells[curr][(local_ncols+2)*(local_nrows+1)+1].u), 1, l_row, bot, 0,
		     comm, MPI_STATUS_IGNORE);
	
	MPI_Sendrecv(&(cells[curr][(local_ncols+2)*(local_nrows)+1].u),   1, l_row, bot, 0,
		     &(cells[curr][1].u),                                 1, l_row, top, 0,
		     comm, MPI_STATUS_IGNORE);
	
	// Update left and right
	MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+1].u),             1, l_col, left,  0,
		     &(cells[curr][1*(local_ncols+2)+local_ncols+1].u), 1, l_col, right, 0,
		     comm, MPI_STATUS_IGNORE);

	MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+local_ncols].u),   1, l_col, right, 0,
		     &(cells[curr][1*(local_ncols+2)].u),               1, l_col, left,  0,
		     comm, MPI_STATUS_IGNORE);



	// We compute the update of the grid
	for(size_t i = 1; i < 1+local_nrows; i++)
	{
	    for(size_t j = 1; j < 1+local_ncols; j++)
	    {
		if(instance.step < 2 || cells[next][j+i*(2+local_ncols)].type != 1)
		{
		    // If walls we do not do anything
		    sqspeed = cells[0][j+i*(2+local_ncols)].s * cells[0][j+i*(2+local_ncols)].s;
		    cells[next][j+i*(2+local_ncols)].u = cells[curr][j+i*(2+local_ncols)].u + (cells[curr][j+i*(2+local_ncols)].v * instance.dt);
		    cells[next][j+i*(2+local_ncols)].v = cells[curr][j+i*(2+local_ncols)].v + sqspeed * (cells[curr][j+(i+1)*(2+local_ncols)].u + cells[curr][j+(i-1)*(2+local_ncols)].u + cells[curr][(j+1) + i*(2+local_ncols)].u + cells[curr][(j-1) + i*(2+local_ncols)].u - (4 * cells[curr][j+i*(2+local_ncols)].u)) * instance.dt;

		    if(instance.step == 3 && cells[next][j+i*(2+local_ncols)].type == 2)
		    {
			// Case of sensors
			sensors[(j-1)+(i-1)*local_ncols] += cells[next][j+i*(2+local_ncols)].u * cells[next][j+i*(2+local_ncols)].u;
		    }
		}
		    
	    }
	}

	if(instance.alldump != NULL && s % instance.frequency == 0)
	{
	    MPI_File dump_file;

	    sprintf(alldump, instance.alldump, (s / instance.frequency));
	    MPI_File_open(comm, alldump, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &dump_file);
		
	    MPI_File_set_view(dump_file, global_grid.m*local_nrows*sizeof(double)*coord[0] + local_ncols*sizeof(double)*coord[1], MPI_DOUBLE, d_matrix, "native", MPI_INFO_NULL);
		
	    MPI_File_write_all(dump_file, &(cells[curr][0].u), 1, d_rmatrix, MPI_STATUS_IGNORE);
	    MPI_File_close(&dump_file);


	}
    }

	
    if(instance.lastdump != NULL)
    {
	// well, how do we do this? maybe it works with a resize?
	MPI_File last_file;
	MPI_File_open(comm, instance.lastdump, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &last_file);
	MPI_File_set_view(last_file, global_grid.m*local_nrows*sizeof(double)*coord[0] + local_ncols*sizeof(double)*coord[1], MPI_DOUBLE, d_matrix, "native", MPI_INFO_NULL); // hmm, there is one grid_start too many here; d_type or MPI_DOUBLE?

	MPI_File_write_all(last_file, &(cells[next][0].u), 1, d_rmatrix, MPI_STATUS_IGNORE);
	MPI_File_close(&last_file);
    }

    if(instance.step == 3 && instance.sensors != NULL)
    {
	MPI_File sensor_file;
	MPI_File_open(comm, instance.sensors, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &sensor_file);


	MPI_Datatype string;
	MPI_Type_contiguous(1024, MPI_CHAR, &string);
	MPI_Type_commit(&string);
	
	char text[1024];
	for(size_t i = 1; i < 1+local_nrows; i++)
	{
	    for(size_t j = 1; j < 1+local_ncols; j++)
	    {
		if(instance.step == 3 && cells[next][j+i*(2+local_ncols)].type == 2)
		{
		    memset(text,0,sizeof(text));
		    sprintf(text, "%zu %zu %f\r\n", (i-1)+coord[0]*local_nrows, (j-1)+coord[1]*local_ncols, sensors[(j-1)+(i-1)*local_ncols]);
		    MPI_File_write(sensor_file, text, 1, string, MPI_STATUS_IGNORE);
		}
		    
	    }
	}
	    
	MPI_Type_free(&string);
	MPI_File_close(&sensor_file);
    }
	

    // Some cleaning
    free(cells);
    free(alldump);
    MPI_Type_free(&a_cell);
    MPI_Type_free(&p_cell);
    MPI_Type_free(&matrix);
    MPI_Type_free(&ematrix);
    MPI_Type_free(&d_type);
    MPI_Type_free(&d_matrix);
    MPI_Type_free(&d_rmatrix);
    MPI_Type_free(&l_row);
    MPI_Type_free(&l_col);
}
Example #27
0
int main(int argc, char* argv[]){
  MPI_Init(NULL, NULL);
  int rank, size;
  int loop, num_alive, loop_iterations;
  int ldboard, ldnbngb, ldglobalboard;
  double t1, time, final_time;
  int periods[2] = {1, 1};
  int *globboard= NULL;
  int *globboard2= NULL;
  int *board;
  int *nbngb;

  /* Initialization of MPI */
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size);
  if(argc >= 2){
    if(!strcmp("-h",argv[1])){
      if(!rank)
	helper();
      MPI_Finalize();
      return EXIT_SUCCESS;
    }
  }
  int i, j;
  int process_per_row = sqrt(size);
  int process_per_column = sqrt(size);
  int dims[2] = {process_per_row, process_per_column};
  
  // It only works if the number of process in the input is a perfect square
  if(size != process_per_column*process_per_row){
    fprintf(stderr, "Square Perfect needed as input size.\nExiting Program.");
    MPI_Finalize();
    return EXIT_FAILURE;
  }

  MPI_Comm grid;

  // Initialize cartesian grid
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods,0, &grid);
  MPI_Comm_rank(grid, &rank);

  /* User input */
  if (argc < 2) {
    loop_iterations = 10;
    BS = 30;
  } else if (argc >= 2){
    loop_iterations = atoi(argv[1]);
    if(argc > 2)
      BS = atoi(argv[2]);
    else
      BS = 30;
  }
  num_alive = 0;

  /* Leading dimension of the global board array */
  ldglobalboard = BS + 2; // +2 for the ghost row/column on each side (+ X +)
  /* Leading dimension of the local board array */
  ldboard = BS/process_per_row + 2; // +2 for the ghost row/column on each side (+ X +)
  /* Leading dimension of the neighbour-count array */
  ldnbngb = BS/sqrt(size); // every process owns the same number of cells, given by this formula

  // Initialization of cells board
  board = (int *)malloc( ldboard * ldboard * sizeof(int) );
  nbngb = (int *)malloc( ldnbngb * ldnbngb * sizeof(int) );

  // Initialization of global cell board (which is common between all processes)
  if(!rank){
    globboard = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int));
    globboard2 = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int));
    num_alive = generate_initial_board( BS, &globboard[1+ldglobalboard] , ldglobalboard );
    output_board( BS, &globboard[1+ldglobalboard], ldglobalboard, 0 );
    fprintf(stderr, "Starting number of living cells = %d\n", num_alive);
  }

  // Matrix block type used by each processes
  MPI_Datatype block2, block;
  MPI_Type_vector(ldboard-2, ldboard-2, ldglobalboard, MPI_INT, &block2);
  MPI_Type_create_resized(block2, 0, sizeof(int), &block);
  MPI_Type_commit(&block);

  // Matrix sub block type used by each processes
  MPI_Datatype sub_block2, sub_block;
  MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &sub_block2);
  MPI_Type_create_resized(sub_block2, 0, sizeof(int), &sub_block);
  MPI_Type_commit(&sub_block);
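  // block describes a (ldboard-2) x (ldboard-2) tile inside the global board
  // (stride ldglobalboard); sub_block describes the same tile inside a local
  // board (stride ldboard). Both are resized to sizeof(int) so that the
  // Scatterv/Gatherv displacements can be given in ints.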

  int *process_count = (int*)malloc(size*sizeof(int));  
  // number of cells per processes
  int *cell_per_processes = (int*)malloc(size*sizeof(int));

  // Compute the counts and displacements for each process (preparing the matrix scatter)
  for (i = 0; i < process_per_row; ++i){
    for (j = 0; j < process_per_column; ++j){
      process_count[i+j*process_per_column]= 1;
      cell_per_processes[i+j*process_per_column]= i*ldglobalboard*(ldboard-2)+j*(ldboard-2);
    }
  }

  /* Explodes matrix into sub_blocks elements */
  MPI_Scatterv(&globboard[1+ldglobalboard], process_count, cell_per_processes, block, &board[ldboard+1], 1, sub_block,0, grid);

  // Initialize for each processes, a table of the neighbours.
  int neighbours[8];
  neighbour_table(neighbours, grid, rank);

  /* Time to begin */
  t1 = mytimer();
  int blocksize = ldboard-2;
  MPI_Datatype row_blocks;
  MPI_Type_vector(blocksize, 1, ldboard, MPI_INT, &row_blocks);
  MPI_Type_commit(&row_blocks);

  // status used by the MPI_Wait calls below
  MPI_Status mpi_status;

  // Create as many MPI requests as there can be neighbours (8 in the worst case)
  MPI_Request cart_request[8];
  for (loop = 1; loop <= loop_iterations; ++loop) {
    /* Start communications to send and recv informations from neighbours */
    inter_proc_communications(cart_request, neighbours, grid, blocksize, board, ldboard, row_blocks);

    /* Compute inside process cells */
    for (j = 2; j <= blocksize-1; ++j) {
      for (i = 2; i <= blocksize-1; ++i) {
	ngb( i, j ) =
	  cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
	  cell( i-1, j   ) +                  cell( i+1, j   ) +
	  cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
      }
    }

    /* Computes cells on the border */

    // Cell neighbour's composition
    
    // 4 2 5       4           4 2 5       4 2 5       4 2 5 //
    // 0 X 1  -->  0      -->  0      -->  0   1  -->  0   1 //
    // 6 3 7       6           6           6   7       6 3 7 //
    
    /* Column on the left needs data from the left process --> 4, 0, 6*/ 
    MPI_Wait(&cart_request[0], &mpi_status);
    MPI_Wait(&cart_request[4], &mpi_status);
    MPI_Wait(&cart_request[6], &mpi_status);
    process_frontier(1, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

    /* Line above needs data from the above process --> 2, 5 */
    MPI_Wait(&cart_request[2], &mpi_status);
    MPI_Wait(&cart_request[5], &mpi_status);
    process_frontier(1, blocksize, board, ROW, ldboard, nbngb, ldnbngb);

    /* Column on the right needs data from the right process --> 1, 7 */
    MPI_Wait(&cart_request[1], &mpi_status);
    MPI_Wait(&cart_request[7], &mpi_status);
    process_frontier(blocksize, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

    /* Line under needs data from under process --> 3 */
    MPI_Wait(&cart_request[3], &mpi_status);
    process_frontier(blocksize, blocksize, board, ROW, ldboard, nbngb, ldnbngb);


    /* Update the cell */
    num_alive = 0;
    for (j = 1; j <= blocksize; ++j) {
      for (i = 1; i <= blocksize; ++i) {
	if ( (ngb( i, j ) < 2) ||
	     (ngb( i, j ) > 3) ) {
	  cell(i, j) = 0;
	}
	else {
	  if ((ngb( i, j )) == 3)
	    cell(i, j) = 1;
	}
	if (cell(i, j) == 1) {
	  num_alive+=1;
	}
      }
    }
    printf("%d \n", num_alive);
  }

  /* Reassembles matrix into one from the sub blocks in the block */
  MPI_Gatherv(&board[ldboard+1], 1, sub_block, &globboard2[1+ldglobalboard], process_count, cell_per_processes, block, 0, grid);

  /* Reduction to determine max time execution */
  time = mytimer() - t1;
  MPI_Allreduce(&time, &final_time, 1,MPI_DOUBLE, MPI_MAX, grid);
  
  /* Reduction to determine number of cells still alive in all processes */
  MPI_Allreduce(MPI_IN_PLACE, &num_alive, 1, MPI_INT, MPI_SUM, grid);
  
  /* The END */
  if(!rank){
    // How many cells are still alive at the end of the run?
    printf("Final number of living cells = %d\n", num_alive);
    printf("time=%.2lf ms\n",(double)time * 1.e3);
    char str [100];
    // create debug file 
    sprintf(str, "mpi_debug_%d.dat", size);
    FILE *fd = NULL;
    fd=fopen(str, "w");
    // JUST TELL ME IF IT WORKS !!
    if (fd != NULL)
      fprintf(fd,"%.2lf", time*1.e3);
    else
      exit(EXIT_FAILURE);
    fclose(fd);
    output_board( BS, &globboard2[1+ldglobalboard], ldglobalboard, loop_iterations);
  }
  // FREE ALL
  free(process_count);
  free(cell_per_processes);
  free(board);
  free(nbngb);
  MPI_Finalize();
  // The final end
  return EXIT_SUCCESS;
}
Example #28
0
int main(int argc, char* argv[])
{
    int i, j, loop, num_alive, maxloop;
    int ldgboard,ldboard, ldnbngb;
    double t1, t2;
    double temps;
    int *gboard;
    int *board;
    int *nbngb;

    int size;
    int coord[2], id;
    int procs_per_lines_col;

    MPI_Init(NULL,NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    procs_per_lines_col = sqrt(size);
    if(procs_per_lines_col * procs_per_lines_col != size) {
      fprintf(stderr, "Renseignez un nombre carré de processeurs siouplait !\n");
      MPI_Finalize();
      exit(EXIT_FAILURE);
    }

    int dims[2]; dims[0] = procs_per_lines_col; dims[1] = procs_per_lines_col;
    int periods[2]; periods[0] = 1; periods[1] = 1;
    MPI_Comm comm_cart;
    
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_cart);
    MPI_Comm_rank(comm_cart, &id);
    MPI_Cart_coords(comm_cart, id, 2, coord);

    if (argc < 3) {
	printf("Usage: %s nb_iterations size\n", argv[0]);
	return EXIT_SUCCESS;
    } else {
	maxloop = atoi(argv[1]);
	BS = atoi(argv[2]);
	//printf("Running sequential version, grid of size %d, %d iterations\n", BS, maxloop);
    }
    num_alive = 0;


    //Generate the neighbours table
    
    /* Leading dimension of the global board array */
    ldgboard = BS + 2;
    /* Leading dimension of the board array */
    ldboard = BS/procs_per_lines_col + 2;
    /* Leading dimension of the neigbour counters array */
    ldnbngb = BS/procs_per_lines_col;

    board = malloc( ldboard * ldboard * sizeof(int) );
    nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );
    
    if(id == 0) {
      gboard = malloc(ldgboard * ldgboard * sizeof(int));
      num_alive = generate_initial_board( BS, &gboard[1+ldgboard], ldgboard );
      //fprintf(stderr,"Starting number of living cells = %d\n", num_alive);
    }

    MPI_Datatype block;
    MPI_Type_vector(ldboard-2, ldboard-2, ldgboard, MPI_INT, &block);
    MPI_Type_create_resized(block, 0, sizeof(int), &block);
    MPI_Type_commit(&block);

    MPI_Datatype subblock;
    MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &subblock);
    MPI_Type_create_resized(subblock, 0, sizeof(int), &subblock);
    MPI_Type_commit(&subblock);
    
    int * counts = (int*) malloc(size*sizeof(int));
    int * displs = (int*) malloc(size*sizeof(int));
    // Define the counts and displacements for each process
    for (int i = 0; i < procs_per_lines_col; ++i)
      {
	for (int j = 0; j < procs_per_lines_col; ++j)
	  {
	    counts[i+j*procs_per_lines_col]= 1;
	    displs[i+j*procs_per_lines_col]= i*ldgboard*(ldboard-2)+j*(ldboard-2);
	  }
      }
    MPI_Scatterv(&gboard[1+ldgboard], counts, displs, block, &board[ldboard+1], 1,
				subblock,0, comm_cart);
    

    int neighbours[8];
    make_neighbours_table(neighbours, comm_cart);    
    MPI_Request req[8];

    int block_size = ldboard - 2;
    MPI_Datatype block_line;
    MPI_Type_vector(block_size+2, 1, ldboard,MPI_INT, &block_line);
    MPI_Type_commit(&block_line);
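    // block_line selects block_size+2 ints spaced ldboard apart, i.e. one full
    // border line of the local board including its two ghost end cells; it is
    // used by make_communications for the halo exchange.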

    t1 = mytimer();

    for (loop = 1; loop <= maxloop; loop++) {
      make_communications(req, comm_cart, neighbours, block_size, board, ldboard, block_line);
	  
	  /*	cell(   0, 0   ) = cell(BS, BS);
	cell(   0, BS+1) = cell(BS,  1);
	cell(BS+1, 0   ) = cell( 1, BS);
	cell(BS+1, BS+1) = cell( 1,  1);

	for (i = 1; i <= BS; i++) {
	    cell(   i,    0) = cell( i, BS);
	    cell(   i, BS+1) = cell( i,  1);
	    cell(   0,    i) = cell(BS,  i);
	    cell(BS+1,    i) = cell( 1,  i);
	}
	  */

      //Inner cells 
	for (j = 2; j <= block_size; j++) {
	    for (i = 2; i <= block_size; i++) {
		ngb( i, j ) =
		    cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
		    cell( i-1, j   ) +                  cell( i+1, j   ) +
		    cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
	    }
	}

	//On LEFT
	MPI_Wait(&req[0], MPI_STATUS_IGNORE);
	MPI_Wait(&req[4], MPI_STATUS_IGNORE);
	MPI_Wait(&req[6], MPI_STATUS_IGNORE);
	//Compute the left border cells
	for(j = 1; j <= block_size; j++) {
	  ngb( 1, j ) =
	    cell( 0, j-1 ) + cell( 1, j-1 ) + cell( 2, j-1 ) +
	    cell( 0, j   ) +                  cell( 2, j   ) +
	    cell( 0, j+1 ) + cell( 1, j+1 ) + cell( 2, j+1 );
	}
	
	//On TOP
	MPI_Wait(&req[1], MPI_STATUS_IGNORE);
	MPI_Wait(&req[5], MPI_STATUS_IGNORE);
	//Compute the top border cells
	for(i = 1; i <= block_size; i++) {
	  ngb( i, 1 ) =
	    cell( i - 1, 0) + cell( i, 0 ) + cell( i + 1, 0 ) +
	    cell( i - 1, 1) +                cell( i + 1, 1 ) +
	    cell( i - 1, 2) + cell( i, 2 ) + cell( i + 1, 2 );
	}


	//On RIGHT
	MPI_Wait(&req[2], MPI_STATUS_IGNORE);
	MPI_Wait(&req[7], MPI_STATUS_IGNORE);
	//Compute the right border cells
	for(j = 1; j <= block_size; j++) {
	  ngb( block_size, j ) =
	    cell( block_size - 1, j-1 ) + cell( block_size , j-1 ) + cell( block_size + 1, j-1 ) +
	    cell( block_size - 1, j   ) +                            cell( block_size + 1, j   ) +
	    cell( block_size - 1, j+1 ) + cell( block_size, j+1 ) + cell(  block_size + 1, j+1 );
	}
	

	
	//ON BOT
	MPI_Wait(&req[3], MPI_STATUS_IGNORE);
	//Compute the bottom border cells
	for(i = 1; i <= block_size; i++) {
	  ngb( i, block_size ) =
	    cell( i - 1, block_size - 1) + cell( i, block_size - 1 ) + cell( i + 1, block_size - 1 ) +
	    cell( i - 1, block_size ) +                cell( i + 1, block_size ) +
	    cell( i - 1, block_size + 1 ) + cell( i, block_size + 1 ) + cell( i + 1, block_size + 1 );
	}


	num_alive = 0;
	for (j = 1; j <= block_size; j++) {
	    for (i = 1; i <= block_size; i++) {
		if ( (ngb( i, j ) < 2) ||
		     (ngb( i, j ) > 3) ) {
		    cell(i, j) = 0;
		}
		else {
		    if ((ngb( i, j )) == 3)
			cell(i, j) = 1;
		}
		if (cell(i, j) == 1) {
		    num_alive ++;
		}
	    }
	}

        /* With the border cells included (useful to check the MPI communications) */
        /* output_board( BS+2, &(cell(0, 0)), ldboard, loop ); */

        /* With only the "real" cells: we start at element (1,1) */
	//output_board( BS, &(cell(1, 1)), ldboard, loop);

	//printf("%d cells are alive\n", num_alive);
    }
    MPI_Gatherv(&board[ldboard+1], 1, subblock,&gboard[ldgboard+1], counts,displs, block, 0, comm_cart);

    t2 = mytimer();

    temps = t2 - t1;
    MPI_Allreduce(MPI_IN_PLACE,&temps, 1, MPI_DOUBLE, MPI_MAX, comm_cart);
    MPI_Allreduce(MPI_IN_PLACE,&num_alive, 1, MPI_INT, MPI_SUM, comm_cart);
    if(id == 0) {
      //printf("Final number of living cells = %d\n", num_alive);
      printf("%.2lf\n",(double)temps * 1.e3);
    }
    free(board);
    free(nbngb);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
Example #29
0
int main(int argc, char ** argv)
{
	int ncid, dimid, varid;
	MPI_Init(&argc, &argv);
	MPI_Datatype vtype, rtype, usertype;
	MPI_Aint lb, extent;
	int userbufsz, *userbuf, *cmpbuf, i, errs=0;
	int count = 25;
	double pi = 3.14159;
	MPI_Offset start, acount;

	ncmpi_create(MPI_COMM_WORLD, "vectors.nc", NC_CLOBBER, MPI_INFO_NULL,
			&ncid);
	ncmpi_def_dim(ncid, "50k", 1024*50, &dimid);
	ncmpi_def_var(ncid, "vector", NC_DOUBLE, 1, &dimid, &varid);

	ncmpi_enddef(ncid);


	MPI_Type_vector(VECCOUNT, BLOCKLEN, STRIDE, MPI_INT, &vtype);
	MPI_Type_create_resized(vtype, 0, STRIDE*VECCOUNT*sizeof(int), &rtype);
	MPI_Type_contiguous(count, rtype, &usertype);
	MPI_Type_commit(&usertype);
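	/* rtype stretches each strided vector to an extent of STRIDE*VECCOUNT ints,
	   so MPI_Type_contiguous(count, rtype, ...) tiles the vectors back to back
	   without overlapping their strided blocks. */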

	MPI_Type_free(&vtype);
	MPI_Type_free(&rtype);

	MPI_Type_get_extent(usertype, &lb, &extent);
	userbufsz = extent;
	userbuf = malloc(userbufsz);
	cmpbuf = calloc(userbufsz, 1);
	for (i=0; i< userbufsz/sizeof(int); i++) {
		userbuf[i] = pi*i;
	}


	start = 10; acount = count*12;
	ncmpi_begin_indep_data(ncid);
	ncmpi_put_vara(ncid, varid, &start, &acount, 
			userbuf, 1, usertype);

	ncmpi_close(ncid);

	NC_CHECK(ncmpi_open(MPI_COMM_WORLD, "vectors.nc", NC_NOWRITE,
				MPI_INFO_NULL, &ncid));
	ncmpi_begin_indep_data(ncid);
	NC_CHECK(ncmpi_inq_varid(ncid, "vector", &varid));
	NC_CHECK(ncmpi_get_vara(ncid, varid, &start, &acount,
			cmpbuf, 1, usertype));
	ncmpi_close(ncid);

	for (i=0; errs < 10 &&  i < acount; i++) {
		/* vector of 4,3,5, so skip 4th and 5th items of every block */
		if (i%STRIDE >= BLOCKLEN) continue;
		if (userbuf[i] != cmpbuf[i]) {
			errs++;
			fprintf(stderr, "%d: expected 0x%x got 0x%x\n", 
					i, userbuf[i], cmpbuf[i]);
		}
	}
	free(userbuf);
	free(cmpbuf);
	MPI_Type_free(&usertype);
	MPI_Finalize();
	return 0;
}
Example #30
0
int main(int argc, char **argv)
{
    int i, j, k;
    double start, end;
    /* Time array */
    double time[9];
	double comm_time = 0;
	double comp_time = 0;
    int chunkSize;
    MPI_Status status;
    /* Being used in FFT */
    float data[N][N];
    /* Being used in mm */
    float input_1[N][N], input_2[N][N];
    /* Local matrix for FFT */
    float local_data[N][N];

    /* World rank and processor, related to MPI_COMM_WORLD */
    int world_id;
    int world_processor;

    /* Divided rank and processors for communication, related to taskcomm */
    int task_id;
    int task_processor;

    /* A complex array storing the temporary row or column being transformed by the FFT */
    complex temp_data[N];

    /* Initialize rank and the number of processor for the MPI */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_id);
    MPI_Comm_size(MPI_COMM_WORLD, &world_processor);

    /* Initialize a new vector for distributing columns */
    MPI_Datatype column, col;
    /* Column vector */
    MPI_Type_vector(N, 1, N, MPI_FLOAT, &col);
    MPI_Type_commit(&col);
    MPI_Type_create_resized(col, 0, 1*sizeof(float), &column);
    MPI_Type_commit(&column);
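    /* Optional sanity check (a sketch, not part of the original program): after the
       resize, consecutive "column" types start one float apart, so scattering several
       of them per rank hands out adjacent columns of the matrix. */
    {
        MPI_Aint lb, ext;
        MPI_Type_get_extent(column, &lb, &ext);
        if (world_id == 0 && ext != (MPI_Aint)sizeof(float))
            printf("unexpected extent for column type: %ld\n", (long)ext);
    }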

    int task = world_id%4;
    MPI_Comm taskcomm;
    /* Split the MPI_COMM_WORLD */
    MPI_Comm_split(MPI_COMM_WORLD, task, world_id, &taskcomm);
    MPI_Comm_rank(taskcomm, &task_id);
    MPI_Comm_size(taskcomm, &task_processor);
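    /* The world communicator is split into four task groups that form a pipeline:
       group 0 FFTs 1_im1, group 1 FFTs 1_im2, group 2 multiplies the transformed
       matrices, and group 3 applies the inverse FFT and writes the output file.
       Data moves between groups through the inter-communicators created below. */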

    /* Initialize inter communicators */
    MPI_Comm t1_t3_comm, t2_t3_comm, t3_t4_comm;

    /* Calculate chunkSize */
    chunkSize = N/task_processor;

    /* Get the start time of all program */
    if(world_id == 0){
        printf("2D convolution using MPI task and data parallelism\n");
        start = MPI_Wtime();
    }

    /* Each group completes work and send results by inter communicators */
    if(task == 0){
        // task 1
        /* Create an inter communicator for task 1 and task 3 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 1, &t1_t3_comm);

        if(task_id == 0){
            time[0] = MPI_Wtime();

            /* Read file */
            readIm1File(data);
            time[1] = MPI_Wtime();

            printf("Group 1: Reading file 1_im1 takes %f s.\n", time[1] - time[0]);
        }

        /* Scatter data to local ranks */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT,
                    local_data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        /* Compute time for distributing data */
        if(task_id == 0){
            time[2] = MPI_Wtime();
            printf("Group 1: Scattering 1_im1(row) to each processor takes %f s.\n", time[2] - time[1]);
        }

        /* Do 1_im1 2d FFT */
        /* Row FFT */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* FFT each row for im1 */
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, -1);

            for(j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        /* Gather all the data and distribute in columns */
        if(task_id == 0){
            time[3] = MPI_Wtime();
            printf("Group 1: FFT each row for 1_im1 takes %f s.\n", time[3] - time[2]);
        }

        /* Gather all the data of 1_im1 */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT,
                    data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        if(task_id == 0){
            time[4] = MPI_Wtime();
            printf("Group 1: Gathering all the data of 1_im1(row) takes %f s.\n", time[4] - time[3]);
        }

        /* Scatter all the data to column local data */
        MPI_Scatter(data, chunkSize, column,
                    local_data, chunkSize, column,
                    0, taskcomm);

        if(task_id == 0){
            time[5] = MPI_Wtime();
            printf("Group 1: Scattering 1_im1(column) to each processor takes %f s.\n", time[5] - time[4]);
        }

        /* Column FFT */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* FFT each column for im1 */
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, -1);

            for(j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        /* Gather all the columns from each rank */
        if(task_id == 0){
            time[6] = MPI_Wtime();
            printf("Group 1: FFT each column for 1_im1 takes %f s.\n", time[6] - time[5]);
        }

        MPI_Gather(local_data, chunkSize, column,
                    data, chunkSize, column,
                    0, taskcomm);

        /* Compute time and distribute data to do matrix multiplication */
        if(task_id == 0){
            time[7] = MPI_Wtime();
            printf("Group 1: Gathering all the data of 1_im1(column) takes %f s.\n", time[7] - time[6]);
            /* Total time */
            printf("Group 1: Total time for task 1 in group 1 takes %f s.\n", time[7] - time[0]);

			comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1];
			comp_time += time[6] - time[5] + time[3] - time[2];
            /* Send data to group 3 via the inter communicator */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm);
        }
    }
    else if(task == 1){
        // Task 2
        /* Create an inter communicator for task 2 and task 3 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 2, &t2_t3_comm);

        if(task_id == 0){
            time[0] = MPI_Wtime();

            /* Read file */
            readIm2File(data);
            time[1] = MPI_Wtime();

            printf("Group 2: Reading file 1_im2 takes %f s.\n", time[1] - time[0]);
        }

        /* Scatter data to local ranks */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT,
                    local_data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        /* Compute time for distributing data */
        if(task_id == 0){
            time[2] = MPI_Wtime();
            printf("Group 2: Scatter 1_im2(row) to each processor takes %f s.\n", time[2] - time[1]);
        }

        /* Do 1_im1 2d FFT */
        /* Row FFT */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* FFT each row for im2 */
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, -1);

            for(j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        /* Gather all the data and distribute in columns */
        if(task_id == 0){
            time[3] = MPI_Wtime();
            printf("Group 2: FFT each row for 1_im2 takes %f s.\n", time[3] - time[2]);
        }

        /* Gather all the data of 1_im1 */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT,
                    data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        if(task_id == 0){
            time[4] = MPI_Wtime();
            printf("Group 2: Gather all the data of 1_im2(row) takes %f s.\n", time[4] - time[3]);
        }

        /* Scatter all the data to column local data */
        MPI_Scatter(data, chunkSize, column,
                    local_data, chunkSize, column,
                    0, taskcomm);

        if(task_id == 0){
            time[5] = MPI_Wtime();
            printf("Group 2: Scatter 1_im2(column) to each processor takes %f s.\n", time[5] - time[4]);
        }

        /* Column FFT */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* FFT each column for im2 */
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, -1);

            for(j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        /* Gather all the columns from each rank */
        if(task_id == 0){
            time[6] = MPI_Wtime();
            printf("Group 2: FFT each column for 1_im2 takes %f s.\n", time[6] - time[5]);
        }

        MPI_Gather(local_data, chunkSize, column,
                    data, chunkSize, column,
                    0, taskcomm);

        /* Compute time and distribute data to do matrix multiplication */
        if(task_id == 0){
            time[7] = MPI_Wtime();
            printf("Group 2: Gather all the data of 1_im2(column) takes %f s.\n", time[7] - time[6]);
            /* Total time */
            printf("Group 2: Total time for task 2 in group 2 takes %f s.\n", time[7] - time[0]);
			
			comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1];
			comp_time += time[6] - time[5] + time[3] - time[2];
            /* Send data to group 3 via the inter communicator */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm);
        }
    }
    else if(task == 2){
        // Task 3
        /* Local matrix for matrix multiplication */
        float local_data2[chunkSize][N];
        /* Create inter communicators for task 1 and task3, task 2 and task 3, task 3 and task 4 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 0, 1, &t1_t3_comm);
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 1, 2, &t2_t3_comm);
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 3, 3, &t3_t4_comm);

        /* Receive data from group 1 and group 2 */
        if(task_id == 0){
            time[0] = MPI_Wtime();

            MPI_Recv(input_1, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm, &status);
            MPI_Recv(input_2, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm, &status);

            time[1] = MPI_Wtime();

            /* Time of receiving data from group 1 and group 2 */
            printf("Group 3: Receiving data from group 1 and group 2 takes %f s.\n", time[1] - time[0]);
        }

        /* Do matrix multiplication */
        MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT,
                    local_data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);
        /* Broadcast data2 to all the ranks */
        MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, taskcomm);

        if(task_id == 0){
            time[2] = MPI_Wtime();
            printf("Group 3: Scattering data for multiplication takes %f s.\n", time[2] - time[1]);
        }

        /* Matrix multiplication */
        for(i = 0; i < chunkSize; i++)
            for(j = 0; j < N; j++){
                local_data2[i][j] = 0;
                for(k = 0; k < N; k++)
                    local_data2[i][j] += local_data[i][k]*input_2[k][j];
            }

        /* Collect multiplication result from each rank */
        if(task_id == 0){
            time[3] = MPI_Wtime();
            printf("Group 3: Matrix multiplication takes %f s.\n", time[3] - time[2]);
        }

        /* Gather data */
        MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT,
                   data, chunkSize*N, MPI_FLOAT,
                   0, taskcomm);

        if(task_id == 0){
            time[4] = MPI_Wtime();
            printf("Group 3: Gathering data after Matrix multiplication takes %f s.\n", time[4] - time[3]);
            /* total time */
            printf("Group 3: Total time for task 3 in group 3 takes %f s.\n", time[4] - time[0]);
            /* send result of matrix multiplication to group 4 */
            MPI_Send(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm);
        }
		
		comm_time += time[4] - time[3] + time[2] - time[0];
		comp_time += time[3] - time[2];

        MPI_Comm_free(&t1_t3_comm);
        MPI_Comm_free(&t2_t3_comm);
    }
    else{
        // Task 4
        /* Create an inter communicator for task 3 and task 4 */
        MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 3, &t3_t4_comm);

        /* Receive data from group 3 */
        if(task_id == 0){
            time[0] = MPI_Wtime();

            MPI_Recv(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm, &status);

            time[1] = MPI_Wtime();
            printf("Group 4: Receiving data from group 3 takes %f s.\n", time[1] - time[0]);
        }

        /* Scatter data to each processor */
        MPI_Scatter(data, chunkSize*N, MPI_FLOAT,
                    local_data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        if(task_id == 0){
            time[2] = MPI_Wtime();
            printf("Group 4: Scattering data of rows to each processor takes %f s.\n", time[2] - time[1]);
        }

        /* Inverse-2DFFT(row) */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* inverse FFT of each row of the product */
                temp_data[j].r = local_data[i][j];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, 1);

            for(j = 0; j < N; j++)
                local_data[i][j] = temp_data[j].r;
        }

        if(task_id == 0){
            time[3] = MPI_Wtime();
            printf("Group 4: Inverse-2DFFT(row) takes %f s.\n", time[3] - time[2]);
        }
        /* Gather all the data */
        MPI_Gather(local_data, chunkSize*N, MPI_FLOAT,
                    data, chunkSize*N, MPI_FLOAT,
                    0, taskcomm);

        if(task_id == 0){
            time[4] = MPI_Wtime();
            printf("Group 4: Gathering data of Inverse-2DFFT(row) takes %f s.\n", time[4] - time[3]);
        }

        MPI_Scatter(data, chunkSize, column,
                    local_data, chunkSize, column,
                    0, taskcomm);

        if(task_id == 0){
            time[5] = MPI_Wtime();
            printf("Group 4: Scattering data of columns to each processor takes %f s.\n", time[5] - time[4]);
        }

        /* Inverse-2DFFT(column) for output file */
        for(i = 0; i < chunkSize; i++){
            for(j = 0; j < N; j++){
                /* inverse FFT of each column of the product */
                temp_data[j].r = local_data[j][i];
                temp_data[j].i = 0;
            }

            c_fft1d(temp_data, N, 1);

            for(j = 0; j < N; j++)
                local_data[j][i] = temp_data[j].r;
        }

        if(task_id == 0){
            time[6] = MPI_Wtime();
            printf("Group 4: Inverse-2DFFT(column) takes %f s.\n", time[6] - time[5]);
        }

        /* Gather all the columns of output file from each rank */
        MPI_Gather(local_data, chunkSize, column,
                    data, chunkSize, column,
                    0, taskcomm);

        if(task_id == 0){
            time[7] = MPI_Wtime();
                printf("Group 4: Gathering data of Inverse-2DFFT(column) takes %f s.\n", time[7] - time[6]);

            writeFile(data);
            time[8] = MPI_Wtime();
            printf("Group 4: Writing file to out_1 takes %f s.\n", time[8] - time[7]);
			
			comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[0];
			comp_time += time[6] - time[5] + time[3] - time[2];
        }
        MPI_Comm_free(&t3_t4_comm);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    if(world_id == 0){
        end = MPI_Wtime();
		printf("Total communication time of 2D convolution using MPI task parallel takes %f s.\n", comm_time);
		printf("Total computing time of 2D convolution using MPI task parallel takes %f s.\n", comp_time);
		printf("Total running time without loading/writing of 2D convolution using MPI task parallel takes %f s.\n", comm_time + comp_time);
        printf("Total running time of 2D convolution using MPI task parallel takes %f s.\n", end - start);
    }

    /* Free vector and task comm */
    MPI_Type_free(&column);
    MPI_Type_free(&col);
    MPI_Comm_free(&taskcomm);
    MPI_Finalize();
    return 0;
}