Example #1
int main(int argc, char** argv) {
    int rank;
    assertz("MPI_Init", MPI_Init(&argc, &argv));
    assertz("MPI_Comm_rank", MPI_Comm_rank(MPI_COMM_WORLD, &rank));

    const int data_size = 2;
    int data_replace[data_size];
    const int tag_1 = 12345;
    const int tag_2 = 67890;
    const int jack = 0;
    const int jill = 1;
    MPI_Status stat;
    if (rank == jack) {
        data_replace[0] = 11;
        data_replace[1] = 12;
        MPI_Sendrecv_replace(data_replace, data_size, MPI_INT, jill, tag_1, jill, tag_2, MPI_COMM_WORLD, &stat);
    }
    if (rank == jill) {
        data_replace[0] = 21;
        data_replace[1] = 22;
        MPI_Sendrecv_replace(data_replace, data_size, MPI_INT, jack, tag_2, jack, tag_1, MPI_COMM_WORLD, &stat);
    }
    if (rank < 2) {
        printf("rank = %d : data_replace[] = {%d, %d} \n", rank, data_replace[0], data_replace[1]);
    }

    assertz("MPI_Finalize", MPI_Finalize());
    exit(EXIT_SUCCESS);
}
Example #2
int main(int argc, char *argv[])
{
	int *matrix_a;
	int *matrix_b;
	int *matrix_c;

	const char *matrix_a_filename = argv[1];
	const char *matrix_b_filename = argv[2];
	const char *matrix_c_filename = argv[3];

	MPI_Comm matrix_comm;

	MPI_Init(&argc, &argv);

	create_matrix_comm(MPI_COMM_WORLD, &matrix_comm);
	MPI_Comm_size(matrix_comm, &size);
	MPI_Comm_rank(matrix_comm, &rank);

	compute_matrixes_variables(matrix_a_filename, matrix_comm);

	alloc_submatrix_buffer(&matrix_a);
	alloc_submatrix_buffer(&matrix_b);
	alloc_submatrix_buffer(&matrix_c);

	distribute_matrix(matrix_a_filename, matrix_a, matrix_comm);
	distribute_matrix(matrix_b_filename, matrix_b, matrix_comm);

	/* The actual Cannon algorithm */
	int row_source, row_dst;
	int col_source, col_dst;
	MPI_Cart_shift(matrix_comm, 0, -1, &row_source, &row_dst);
	MPI_Cart_shift(matrix_comm, 1, -1, &col_source, &col_dst);
	int i;
	for (i = 0; i < pp_dims; i++) {
		compute_matrix_mul(matrix_a, matrix_b, matrix_c, N);
		/* MPI_ANY_TAG is only valid as a receive tag, so use a concrete tag here */
		MPI_Sendrecv_replace(matrix_a, sub_n * sub_n, MPI_INT,
				     row_source, 0, row_dst, 0,
				     matrix_comm, MPI_STATUS_IGNORE);

		MPI_Sendrecv_replace(matrix_b, sub_n * sub_n, MPI_INT,
				     col_source, 0, col_dst, 0,
				     matrix_comm, MPI_STATUS_IGNORE);
	}


	write_result(matrix_c_filename, matrix_c, matrix_comm);

	free(matrix_a);
	free(matrix_b);
	free(matrix_c);

	MPI_Comm_free(&matrix_comm);

	MPI_Finalize();
	return 0;
}
Example #3
void MatrixMatrixMultiply(double ***a, double ***b, double ***c, int mra, int
        mca, int mrb, int mcb, int *ra, int *ca, int *rb, int *cb, MPI_Comm
        comm)
{
    /*from the teaching book */
    int i, j;
    int num_procs, dims[2], periods[2];
    int myrank, my2drank, mycoords[2];
    int uprank, downrank, leftrank, rightrank, coords[2];
    int shiftsource, shiftdest;
    MPI_Status status; 
    MPI_Comm comm_2d;

    MPI_Comm_size(comm, &num_procs);
    MPI_Comm_rank(comm, &myrank);
    
    dims[0] = dims[1] = 0;
    MPI_Dims_create(num_procs, 2, dims);
    periods[0]= periods[1] = 1;

    MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
    MPI_Comm_rank(comm_2d, &my2drank);
    MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);

    MPI_Cart_shift(comm_2d, 1, -1, &rightrank, &leftrank);
    MPI_Cart_shift(comm_2d, 0, -1, &downrank, &uprank);

    int ia = my2drank;
    int ib = my2drank;

    MPI_Cart_shift(comm_2d, 1, -mycoords[0], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace((*a)[0], mra*mca, MPI_DOUBLE, shiftdest, 1,
            shiftsource, 1, comm_2d, &status);
    MPI_Sendrecv_replace(&ia, 1, MPI_INT, shiftdest, 1, shiftsource, 1,
            comm_2d, &status);

    MPI_Cart_shift(comm_2d, 0, -mycoords[1], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace((*b)[0], mrb*mcb, MPI_DOUBLE, shiftdest, 1,
            shiftsource, 1, comm_2d, &status);  
    MPI_Sendrecv_replace(&ib, 1, MPI_INT, shiftdest, 1, shiftsource, 1,
            comm_2d, &status);

    for (i=0; i<dims[0]; i++){
        MatrixMultiply(ra[ia], ca[ia], rb[ib], cb[ib], *a, *b, c); /* c=c + a*b */

        MPI_Sendrecv_replace((*a)[0], mra*mca, MPI_DOUBLE, leftrank, 1,
                rightrank, 1, comm_2d, &status);
        MPI_Sendrecv_replace((*b)[0], mrb*mcb, MPI_DOUBLE, uprank, 1, downrank,
                1, comm_2d, &status);

        MPI_Sendrecv_replace(&ia, 1, MPI_INT, leftrank, 1, rightrank, 1,
                comm_2d, &status);
        MPI_Sendrecv_replace(&ib, 1, MPI_INT, uprank, 1, downrank, 1,
                comm_2d, &status);
    }

    MPI_Comm_free(&comm_2d);    
}
Example #4
File: fox.c Project: The-coders/fox
/* Fox's algorithm for multiplying square n x n matrices */
void Fox(struct grid_info *grid, int local_n, 
         matrix_type** local_A, matrix_type** local_B, matrix_type** local_C)
{
    int stage;
    int i, j;
    const int local_n_sq = local_n * local_n;
    const int src  = (grid->my_row + 1) % grid->ppside;
    const int dest = (grid->my_row + grid->ppside - 1) % grid->ppside;
    int bcast_root;
    MPI_Status status;
    matrix_type **temp_A = matrix_new(local_n, local_n);

    for (stage = 0; stage < grid->ppside; ++stage) {
        bcast_root = (grid->my_row + stage) % grid->ppside;
        if (bcast_root == grid->my_col) {
            MPI_Bcast(*local_A, local_n * local_n, MPI_FLOAT, bcast_root, grid->row_comm);
            matrix_multiply_and_add(local_A, local_B, local_C, local_n, local_n, local_n);
        }
        else {
            MPI_Bcast(*temp_A,  local_n_sq, MPI_FLOAT, bcast_root, grid->row_comm);
            matrix_multiply_and_add(temp_A, local_B, local_C, local_n, local_n, local_n);
        }
        MPI_Sendrecv_replace(*local_B, local_n_sq, MPI_FLOAT, dest, 0, src, 0, grid->col_comm, &status);
    }
}
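
The function above reads its process-grid layout from the grid struct. For reference, a rough sketch of the fields it assumes (inferred from the accesses above, not the project's actual header):

/* inferred shape of struct grid_info - an assumption based on the fields used
   by Fox(), not necessarily the definition in the The-coders/fox project */
struct grid_info {
    int ppside;         /* number of processes per side of the process grid */
    int my_row, my_col; /* this process's coordinates in the grid */
    MPI_Comm row_comm;  /* communicator spanning this process's row */
    MPI_Comm col_comm;  /* communicator spanning this process's column */
};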
Example #5
void mpi_manager_2D::do_MPISendRecv(NumMatrix<double,2> &buff,
                                    int Destination) {
	//! Do a send-receive operation, where the send-buffer is overwritten
	/*! Origin and destination are the same in this case
	 */
	// Get size of buffer:
	int size = ((buff.getHigh(1) - buff.getLow(1) + 1)*
	            (buff.getHigh(0) - buff.getLow(0) + 1));

	// MPI_Request request[1] = {MPI_REQUEST_NULL};
	// MPI_Request request;
	MPI_Status status;

	//	int tag = rank;
	int SendTag = rank;
	int RecvTag = Destination;
	// int SendTag = rank + Destination;
	// int RecvTag = Destination + rank;


	// Now do the communication:
	MPI_Sendrecv_replace((double *) buff, size, MPI_DOUBLE, Destination,
	                     SendTag, Destination, RecvTag,
	                     comm2d, &status);


	// MPI_Waitall(1, request);
	

}
Example #6
File: ring.c Project: pheenyx/HPC
int main(int argc, char** argv) {

    MPI_Init(&argc, &argv);

    int size, rank, i, dest, source, sum, temp_val;
    MPI_Status status;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    dest = (rank + 1) % size;
    source = (rank - 1 + size) % size;
    sum = temp_val = rank; 
    for(i = 1; i < size; i++){
        MPI_Sendrecv_replace(&temp_val, 1, MPI_INT, dest, 0, source, 0, MPI_COMM_WORLD, &status);
        sum += temp_val ;
    }

    printf("Process %d: %d\n", rank, sum);

    MPI_Finalize();

    return EXIT_SUCCESS;

}
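
For comparison, the sum of all ranks computed by this ring can also be obtained with a single collective call. A minimal sketch (not part of the ring.c project; it only assumes an already-initialized MPI environment):

/* hedged sketch: sum of ranks via MPI_Allreduce instead of circulating
   a value with MPI_Sendrecv_replace */
int sum_of_ranks(MPI_Comm comm) {
    int rank, sum;
    MPI_Comm_rank(comm, &rank);
    /* every process contributes its rank; every process gets the total */
    MPI_Allreduce(&rank, &sum, 1, MPI_INT, MPI_SUM, comm);
    return sum;
}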
Example #7
double* multiply_matrix_by_vector(double* matrix, double* vector) {
	int vector_length = c_recvcounts[rank];
	int offset = c_displs[rank];
	int incoming_process_data = 0;

	double* result = (double*) calloc(vector_length, sizeof(double));
	double* v = (double*) calloc(N/size + 1, sizeof(double));
	for (int i = 0; i < vector_length; i++) {
		v[i] = vector[i];
	}
	for (int process = 0; process < size; process++) {
		// index of current part of vector
		incoming_process_data = (rank + process) % size;

		for (int i = 0; i < c_recvcounts[rank]; i++) {
			for (int j = 0; j < c_recvcounts[incoming_process_data]; j++) {
				result[i] += matrix[i * N + j + c_displs[incoming_process_data]] * v[j];
			}
		}
		// switch vector, (vector length, vector offset) between processes
		MPI_Sendrecv_replace(v, N/size + 1, MPI_DOUBLE, (rank+1) % size, TAG_SEND_MATRIX,
			(rank - 1 + size) % size, TAG_SEND_MATRIX, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	}
	free(v);
	return result;
}
Example #8
int MPIX_Sendrecv_replace_x(void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int sendtag,
                            int source, int recvtag, MPI_Comm comm, MPI_Status *status)
{
    int rc = MPI_SUCCESS;

    if (likely (count <= bigmpi_int_max )) {
        rc = MPI_Sendrecv_replace(buf, (int)count, datatype, dest, sendtag, source, recvtag, comm, status);
    } else {
        MPI_Datatype newtype;
        BigMPI_Type_contiguous(0,count, datatype, &newtype);
        MPI_Type_commit(&newtype);
        rc = MPI_Sendrecv_replace(buf, 1, newtype, dest, sendtag, source, recvtag, comm, status);
        MPI_Type_free(&newtype);
    }
    return rc;
}
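
A possible usage sketch for the wrapper above (the buffer size, element type, and the helper name swap_big_buffer are illustrative assumptions, not part of BigMPI):

/* exchange a buffer whose element count does not fit in an int;
   assumes <limits.h>, <stdlib.h>, an initialized MPI environment,
   and that BigMPI is linked in */
static void swap_big_buffer(int partner, MPI_Comm comm)
{
    MPI_Count count = (MPI_Count)INT_MAX + 4096;   /* more than 2^31 - 1 elements */
    char *buf = malloc((size_t)count);
    MPI_Status status;
    MPIX_Sendrecv_replace_x(buf, count, MPI_CHAR, partner, 0,
                            partner, 0, comm, &status);
    free(buf);
}

Because count exceeds bigmpi_int_max here, the wrapper takes the large-count branch and performs the exchange through a single committed contiguous datatype.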
Example #9
void mpi_manager_2D::do_MPISendRecv(NumMatrix<double,2> &buff,
                                    int Source, int Destination) {
	//! Do a send-receive operation, where the send-buffer is overwritten
	/*! Get data from somewhere and send own data somewhere else. The original
	  data will be overwritten
	 */
	// Get size of buffer:
	int size = ((buff.getHigh(1) - buff.getLow(1) + 1)*
	            (buff.getHigh(0) - buff.getLow(0) + 1));

	// MPI_Request request[1] = {MPI_REQUEST_NULL};
	// MPI_Request request;
	MPI_Status status;

	//	int tag = rank;
	int SendTag = rank;
	int RecvTag = Source;
	// int SendTag = rank + Destination;
	// int RecvTag = Destination + rank;


	// Now do the communication:
	MPI_Sendrecv_replace((double *) buff, size, MPI_DOUBLE, Destination,
	                     SendTag, Source, RecvTag,
	                     comm2d, &status);


	// MPI_Waitall(1, request);
	

}
Example #10
FORT_DLL_SPEC void FORT_CALL mpi_sendrecv_replace_ ( void*v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8, MPI_Fint *v9, MPI_Fint *ierr ){

#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif

    if (v9 == MPI_F_STATUS_IGNORE) { v9 = (MPI_Fint*)MPI_STATUS_IGNORE; }
    *ierr = MPI_Sendrecv_replace( v1, *v2, (MPI_Datatype)(*v3), *v4, *v5, *v6, *v7, (MPI_Comm)(*v8), (MPI_Status *)v9 );
}
Example #11
/*-------------------------------------------------------------------------------*/
void OneStepCirculation(int step)
{
 MPI_Status   status;
 
 MPI_Sendrecv_replace(A_Slice, SIZE * LOCAL_SIZE, MPI_DOUBLE, ((Me - 1) + NbPE) % NbPE, 0,
                      (Me + 1) % NbPE , 0, MPI_COMM_WORLD, &status);

/******************************** TO DO ******************************************/
}
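
The ring neighbours above are computed by hand with modular arithmetic. For comparison, a minimal sketch (the function name and the ring_comm argument are assumptions, not part of the original exercise) that derives the same neighbours from a periodic 1-D Cartesian communicator:

/* ring_comm is assumed to have been created once, e.g.
 *     int periodic = 1;
 *     MPI_Cart_create(MPI_COMM_WORLD, 1, &NbPE, &periodic, 0, &ring_comm);
 */
void OneStepCirculation_cart(double *slice, int count, MPI_Comm ring_comm)
{
 int src, dst;
 MPI_Status status;
 /* disp = -1: dst is the rank at coordinate Me-1, src the rank at Me+1 */
 MPI_Cart_shift(ring_comm, 0, -1, &src, &dst);
 MPI_Sendrecv_replace(slice, count, MPI_DOUBLE, dst, 0, src, 0, ring_comm, &status);
}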
Example #12
File: grid.c Project: Thundzz/TDP
void prod_matrix(int N, int Nb, int myrank,
	double* bl_a, double* bl_b, double* bl_c,
	MPI_Comm comm_grid, MPI_Comm comm_col, MPI_Comm comm_row)
{
	int k;
	double my_a[Nb*Nb];
	int coords[2];
	MPI_Status st;

	int gd = N/Nb;
	for (int i = 0; i < Nb*Nb; ++i)
	{
		my_a[i] = bl_a[i];
	}

	MPI_Cart_coords(comm_grid, myrank, 2, coords);
	int sndto = (((coords[0]-1)%gd) +gd) %gd;
	int recvfrom = (coords[0]+1)%gd;
	int myrow = coords[0];
	int mycol = coords[1];

	for (k = 0; k < gd; k++)
	{	
		/* If I am i+k%N proc of the line
		 * 	Bcast_line(A[i][i+k%N]) 
		 * Else
		 * 	recv(A) from the i+k%N proc of the line
		 */

		 if(mycol == (myrow+k)%gd)
		 {
		 	for (int i = 0; i < Nb*Nb; ++i)
		 	{
		 		bl_a[i] = my_a[i];
		 	}
		 }
		MPI_Bcast(bl_a, Nb*Nb, MPI_DOUBLE, (myrow+k)%gd, comm_row);

		cblas_dgemm_scalaire(Nb, bl_a, bl_b, bl_c);  //Cij = A[i][i+k%N]*B[i+k%N][j]
			/* send(B) to upper neighbour
			 */
		MPI_Sendrecv_replace(bl_b, Nb*Nb, MPI_DOUBLE, sndto, 0, 
			recvfrom, 0, comm_col, &st);
	}
}
Example #13
void Fox(
         int              n         /* in  */,
         GRID_INFO_T*     grid      /* in  */,
         LOCAL_MATRIX_T*  local_A   /* in  */,
         LOCAL_MATRIX_T*  local_B   /* in  */,
         LOCAL_MATRIX_T*  local_C   /* out */) {
    
    LOCAL_MATRIX_T*  temp_A; /* Storage for the sub-    */
    /* matrix of A used during */
    /* the current stage       */
    int              stage;
    int              bcast_root;
    int              n_bar;  /* n/sqrt(p)               */
    int              source;
    int              dest;
    MPI_Status       status;
    
    n_bar = n/grid->q;
    Set_to_zero(local_C);
    
    /* Calculate addresses for circular shift of B */
    source = (grid->my_row + 1) % grid->q;
    dest = (grid->my_row + grid->q - 1) % grid->q;
    
    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);
    
    for (stage = 0; stage < grid->q; stage++) {
        bcast_root = (grid->my_row + stage) % grid->q;
        if (bcast_root == grid->my_col) {
            MPI_Bcast(local_A, 1, local_matrix_mpi_t,
                      bcast_root, grid->row_comm);
            Local_matrix_multiply(local_A, local_B,
                                  local_C);
        } else {
            MPI_Bcast(temp_A, 1, local_matrix_mpi_t,
                      bcast_root, grid->row_comm);
            Local_matrix_multiply(temp_A, local_B,
                                  local_C);
        }
        MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t,
                             dest, 0, source, 0, grid->col_comm, &status);
    } /* for */
    
} /* Fox */
Example #14
void switchbuff(float *buff, int neighbor, int ndata)
{
  MPI_Status status;
  //  static int tag=0;
  int tag=0;
  int mpi_rank;
  

  FILE *ftest;
  char fnt[256];
  int dum=1;


  MPI_Barrier(MPI_COMM_WORLD);


  MPI_Sendrecv_replace(buff,ndata,MPI_FLOAT,neighbor,tag,neighbor,tag,MPI_COMM_WORLD,&status);
   //tag++;
}
Example #15
int main (int argc, char **argv) {
	// initialize MPI
	MPI_Init (&argc, &argv);

	// we have to remember the number of PEs
	int numpes;
	MPI_Comm_size (MPI_COMM_WORLD, &numpes);

	//for this we need 2 PEs
	assert(numpes == 2);

	// which rank does this process have?
	int myid;
	MPI_Comm_rank (MPI_COMM_WORLD, &myid);

	// deadlock avoidance: PE 0 sends and receives using the same function call, PE 1 uses
	// its own buffer to avoid blocking on send.
	if (myid == 0) {
		// send message to 1, wait for message from 1
		char buf[10000];
		MPI_Status stat;

		MPI_Sendrecv_replace (buf, 10000, MPI_CHAR, 1, 0, 1, 0,MPI_COMM_WORLD, &stat);
		printf ("0: done\n");
	} else {
		// send message to 0, wait for message from 0
		char buf[10000];
		char intermediate_buffer[10000 + MPI_BSEND_OVERHEAD];
		MPI_Buffer_attach (&intermediate_buffer, 10000 + MPI_BSEND_OVERHEAD);

		MPI_Status stat;
		MPI_Bsend (buf, 10000, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
		// we can use buf again, as intermediate_buffer will take care of buffering
		MPI_Recv (buf, 10000, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &stat);
		printf ("1: done\n");
	}
	MPI_Finalize ();

	return EXIT_SUCCESS;
}
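
The asymmetric scheme above is one way to break the deadlock. A symmetric alternative (a sketch under the same two-process assumption, not the author's code) is to let both PEs call MPI_Sendrecv_replace, which removes the need for an attached Bsend buffer:

	/* hedged sketch: both ranks execute the same call; the partner is simply the other PE */
	static void exchange_symmetric (int myid) {
		char buf[10000];
		MPI_Status stat;
		int partner = 1 - myid;
		MPI_Sendrecv_replace (buf, 10000, MPI_CHAR, partner, 0, partner, 0, MPI_COMM_WORLD, &stat);
		printf ("%d: done\n", myid);
	}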
Example #16
void mpi_sendrecv_replace_f(char *buf, MPI_Fint *count, MPI_Fint *datatype,
			    MPI_Fint *dest, MPI_Fint *sendtag,
			    MPI_Fint *source, MPI_Fint *recvtag,
			    MPI_Fint *comm, MPI_Fint *status, MPI_Fint *ierr)
{
   MPI_Datatype c_type = MPI_Type_f2c(*datatype);
   MPI_Comm c_comm;
   MPI_Status c_status;

   c_comm = MPI_Comm_f2c (*comm);
   
   *ierr = OMPI_INT_2_FINT(MPI_Sendrecv_replace(OMPI_F2C_BOTTOM(buf),
                                                OMPI_FINT_2_INT(*count),
                                                c_type, 
                                                OMPI_FINT_2_INT(*dest), 
                                                OMPI_FINT_2_INT(*sendtag), 
                                                OMPI_FINT_2_INT(*source), 
                                                OMPI_FINT_2_INT(*recvtag),
                                                c_comm, &c_status));
    if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr) &&
        !OMPI_IS_FORTRAN_STATUS_IGNORE(status)) {
      MPI_Status_c2f(&c_status, status);
   }
}
Example #17
int main(int argc, char * argv[]) {
	srand(time(NULL));
	int 	my_rank, size, stage, temp;
	int 	n, local_n, i, j, k, source, dest, q, ind;
	float	*matrix_A;
	float	*matrix_B;
	float	*matrix_C;
	float   *local_A;
	float   *local_B;
	float	*local_C;
	double	start, end;
	MPI_Datatype	column;
	MPI_Status	status;
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	if (my_rank == 0) {
		printf("\t\t****************************************************\n");
		printf("\t\t*   Descomposición en Bloques de Rayas por Filas   *\n");
		printf("\t\t****************************************************\n\n");
	}

	if (my_rank == 0) {
		matrix_A = (float *)malloc(MAX*MAX*sizeof(float));
		matrix_B = (float *)malloc(MAX*MAX*sizeof(float));
		matrix_C = (float *)malloc(MAX*MAX*sizeof(float));
		if (argc == 2) {
			sscanf(argv[1], "%d", &n);
		} else if (my_rank == 0) {
			printf("¿ Cuál es el orden de las matrices ? \n");
			scanf("%d", &n);
		}
		local_n = n / size;
		/* Read matrix A */
		Read_matrix ("Enter A :", matrix_A, n);
		Print_matrix ("A was read :", matrix_A, n);

		/* Read matrix B */
		Read_matrix ("Enter B :", matrix_B, n);
		Print_matrix ("B was read :", matrix_B, n);

	}

	MPI_Bcast(&local_n, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

	local_A = (float *) malloc(MAX*MAX*sizeof(float));
	local_B = (float *) malloc(MAX*MAX*sizeof(float));
	local_C = (float *) malloc(MAX*MAX*sizeof(float));

/******************************************************************************/
/* Distribute the row blocks of A and the row blocks of B among
   the processors of the global communicator */

	// Send the row blocks of A and B to all processors
	MPI_Scatter(matrix_A, local_n*n, MPI_FLOAT, local_A, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);
	MPI_Scatter(matrix_B, local_n*n, MPI_FLOAT, local_B, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);
/*****************************************************************************/
/* Block-striped decomposition algorithm: by rows for matrix A
   and by columns for matrix B */
	
	source = (my_rank - 1 + size) % size;
	dest = (my_rank + 1) % size;
	q = n / local_n;
	start = MPI_Wtime();
	for (stage = 0; stage < q; stage++) {
		ind = (my_rank - stage + size) % size;
		for (j = 0; j < local_n; j++) {
			for (i = 0; i < n; i++) {
				for (k = 0; k < local_n; k++) {
					local_C[i + j*n] += local_A[local_n*ind + k + j*n] * local_B[i + k*n];
				}
			}
		}
		MPI_Sendrecv_replace(local_B, local_n*n, MPI_FLOAT, dest, 0, source, 0, MPI_COMM_WORLD, &status);
	}
	end = MPI_Wtime();

/*****************************************************************************/

	// Gather the local_C blocks from each processor into matrix_C on processor 0
	MPI_Gather(local_C, local_n*n, MPI_FLOAT, matrix_C, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);

	if (my_rank == 0) {
		Print_matrix ("El producto es C : ", matrix_C, n);
	}

	if (my_rank == 0)
		printf("n : %d\nDesc. por filas : %f segundos.\n", n, end - start);
	
	MPI_Finalize();

} /* main */
Example #18
void __entry
stencil_thread( void* p )
{
	my_args_t* pargs = (my_args_t*)p;

	int i,j;
	int NI = pargs->ni;
	int NJ = pargs->nj;
	int di = pargs->di;
	int dj = pargs->dj;
	int niter = pargs->niter;
	float* A = pargs->A;
	float* B = pargs->B;
	float w0 = pargs->w0;
	float w1 = pargs->w1;
	float w2 = pargs->w2;
	float w3 = pargs->w3;
	float w4 = pargs->w4;

	int myrank_2d, mycoords[2];
	int dims[2] = {di, dj};
	int periods[2] = {1, 1}; // Periodic communication but ignoring edge copy where irrelevant

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);

	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm comm_2d;
	MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
	MPI_Comm_rank(comm_2d, &myrank_2d);
	MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords);

	int x = mycoords[0];
	int y = mycoords[1];

	// ranks of neighbors
	int north, south, west, east;
	MPI_Cart_shift(comm_2d, 0, 1, &west, &east);
	MPI_Cart_shift(comm_2d, 1, 1, &north, &south);

	// local stencil sizes with padding
	int ni = (NI-2) / di + 2;
	int nj = (NJ-2) / dj + 2;

	// Load the initial values
	void* memfree = coprthr_tls_sbrk(0);
	float* a = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float));
	float* b = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float));
	float* nsbuf = (float*)coprthr_tls_sbrk(ni*sizeof(float));
	float* webuf = (float*)coprthr_tls_sbrk((nj-2)*sizeof(float));
	long long* srcadr;
	long long* dstadr;
	long long* nsend = (long long*)(nsbuf + ni);

	// Copy initial conditions (2D DMA would be better)
	for (j=0; j<nj; j++) e_dma_copy(a+j*ni, A + (y*(ni-2)+j)*NI+x*(nj-2), ni*sizeof(float));

	// Initial conditions
//	if(y==0) for (i=0; i<ni-2; i++) a[i] = -2.0f;
//	if(y==dj) for (i=0; i<ni-2; i++) a[(nj-1)*ni+i] = 1.0f;
//	if(x==di) for (j=0; j<nj-2; j++) a[(j+2)*ni-1] = -1.0f;
//	if(x==0) for (j=0; j<nj-2; j++) a[(j+1)*ni] = 2.0f;

	// Copy "a" into "b" (only need fixed borders would be better)
	for (i=0; i<ni*nj; i++) b[i] = a[i];

	while (niter--) {

/*		for (j=1; j<nj-1; j++) {
			for (i=1; i<ni-1; i++) {
				b[j*ni+i] = w0*a[j*ni+i-1] + w1*a[j*ni+i] + w2*a[j*ni+i+1] + w3*a[j*ni+i-ni] + w4*a[j*ni+i+ni];
			}
		}*/

		for (j=0; j<nj-2; j+=4)
		{
			float a14 = a[(j+1)*ni+0];
			float a15 = a[(j+1)*ni+1];
			float a24 = a[(j+2)*ni+0];
			float a25 = a[(j+2)*ni+1];
			float a34 = a[(j+3)*ni+0];
			float a35 = a[(j+3)*ni+1];
			float a44 = a[(j+4)*ni+0];
			float a45 = a[(j+4)*ni+1];
			for (i=0; i<ni-2; i+=4)
			{
				float a01 = a[(j+0)*ni+i+1];
				float a02 = a[(j+0)*ni+i+2];
				float a03 = a[(j+0)*ni+i+3];
				float a04 = a[(j+0)*ni+i+4];
				float a10 = a14;
				float a11 = a15;
				float a12 = a[(j+1)*ni+i+2];
				float a13 = a[(j+1)*ni+i+3];
				a14 = a[(j+1)*ni+i+4];
				a15 = a[(j+1)*ni+i+5];
				float a20 = a24;
				float a21 = a25;
				float a22 = a[(j+2)*ni+i+2];
				float a23 = a[(j+2)*ni+i+3];
				a24 = a[(j+2)*ni+i+4];
				a25 = a[(j+2)*ni+i+5];
				float a30 = a34;
				float a31 = a35;
				float a32 = a[(j+3)*ni+i+2];
				float a33 = a[(j+3)*ni+i+3];
				a34 = a[(j+3)*ni+i+4];
				a35 = a[(j+3)*ni+i+5];
				float a40 = a44;
				float a41 = a45;
				float a42 = a[(j+4)*ni+i+2];
				float a43 = a[(j+4)*ni+i+3];
				a44 = a[(j+4)*ni+i+4];
				a45 = a[(j+4)*ni+i+5];
				float a51 = a[(j+5)*ni+i+1];
				float a52 = a[(j+5)*ni+i+2];
				float a53 = a[(j+5)*ni+i+3];
				float a54 = a[(j+5)*ni+i+4];

				b[(j+1)*ni+i+1] = fma(w4,a21,fma(w3,a01,fma(w2,a12,fma(w1,a11,w0*a10))));
				b[(j+1)*ni+i+2] = fma(w4,a22,fma(w3,a02,fma(w2,a13,fma(w1,a12,w0*a11))));
				b[(j+1)*ni+i+3] = fma(w4,a23,fma(w3,a03,fma(w2,a14,fma(w1,a13,w0*a12))));
				b[(j+1)*ni+i+4] = fma(w4,a24,fma(w3,a04,fma(w2,a15,fma(w1,a14,w0*a13))));
				b[(j+2)*ni+i+1] = fma(w4,a31,fma(w3,a11,fma(w2,a22,fma(w1,a21,w0*a20))));
				b[(j+2)*ni+i+2] = fma(w4,a32,fma(w3,a12,fma(w2,a23,fma(w1,a22,w0*a21))));
				b[(j+2)*ni+i+3] = fma(w4,a33,fma(w3,a13,fma(w2,a24,fma(w1,a23,w0*a22))));
				b[(j+2)*ni+i+4] = fma(w4,a34,fma(w3,a14,fma(w2,a25,fma(w1,a24,w0*a23))));
				b[(j+3)*ni+i+1] = fma(w4,a41,fma(w3,a21,fma(w2,a32,fma(w1,a31,w0*a30))));
				b[(j+3)*ni+i+2] = fma(w4,a42,fma(w3,a22,fma(w2,a33,fma(w1,a32,w0*a31))));
				b[(j+3)*ni+i+3] = fma(w4,a43,fma(w3,a23,fma(w2,a34,fma(w1,a33,w0*a32))));
				b[(j+3)*ni+i+4] = fma(w4,a44,fma(w3,a24,fma(w2,a35,fma(w1,a34,w0*a33))));
				b[(j+4)*ni+i+1] = fma(w4,a51,fma(w3,a31,fma(w2,a42,fma(w1,a41,w0*a40))));
				b[(j+4)*ni+i+2] = fma(w4,a52,fma(w3,a32,fma(w2,a43,fma(w1,a42,w0*a41))));
				b[(j+4)*ni+i+3] = fma(w4,a53,fma(w3,a33,fma(w2,a44,fma(w1,a43,w0*a42))));
				b[(j+4)*ni+i+4] = fma(w4,a54,fma(w3,a34,fma(w2,a45,fma(w1,a44,w0*a43))));

			}
		}

		// north/south
		dstadr = (long long*)nsbuf;
		srcadr = (long long*)(b+ni);
		while (dstadr != nsend) *dstadr++ = *srcadr++; // second row
		MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, north, 1, south, 1, comm, &status);
		if (y!=dj-1) {
			dstadr = (long long*)(b+(nj-1)*ni);
			srcadr = (long long*)nsbuf;
			while (srcadr != nsend) *dstadr++ = *srcadr++; // last row
		}
		dstadr = (long long*)nsbuf;
		srcadr = (long long*)(b+(nj-2)*ni);
		while (dstadr != nsend) *dstadr++ = *srcadr++; // second to last row
		MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, south, 1, north, 1, comm, &status);
		if (y) {
			dstadr = (long long*)b;
			srcadr = (long long*)nsbuf;
			while (srcadr != nsend) *dstadr++ = *srcadr++; // first row
		}

		// west/east
		for (j=0; j<nj-2; j++) webuf[j] = b[(j+1)*ni+1]; // second column
		MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, west, 1, east, 1, comm, &status);
		if (x!=di-1) for (j=0; j<nj-2; j++) b[(j+2)*ni-1] = webuf[j]; // last column
		for (j=0; j<nj-2; j++) webuf[j] = b[(j+2)*ni-2]; // second to last column
		MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, east, 1, west, 1, comm, &status);
		if (x) for (j=0; j<nj-2; j++) b[(j+1)*ni] = webuf[j]; // first column

		float* tmp = b;
		b = a;
		a = tmp;
	}

	// Copy internal results
	for (j=1; j<nj-1; j++) e_dma_copy(B + (y*(ni-2)+j)*NI+x*(nj-2)+1, a+j*ni+1, (ni-2)*sizeof(float));

	coprthr_tls_brk(memfree);

	MPI_Finalize();
}
Example #19
__kernel void
my_thread( void* p) {

	my_args_t* pargs = (my_args_t*)p;

	int N = pargs->N, s = pargs->s, d = pargs->d; 
	float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc;
	int n = N/d;

	int myrank_2d, mycoords[2];
	int dims[2] = {d, d};
	int periods[2] = {1, 1};

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);

	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm comm_2d;
	MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
	MPI_Comm_rank(comm_2d, &myrank_2d);
	MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords);
	// Compute ranks of the up and left shifts
	int uprank, downrank, leftrank, rightrank;
	MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank);
	MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank);

	int x = mycoords[0];
	int y = mycoords[1];
	// this removes initial skew shift by reading in directly
	int skew = (x+y) % d;

	void* memfree = coprthr_tls_sbrk(0);
	float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float));
	float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float));

	e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read;

#define DWORD_WRITE(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	8, 4*(W-w+2), \
	(void*)src, (void*)dst, &desc)
#define DWORD_READ(desc,w,h,W,src,dst) \
	e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \
	0x0008, 0x0008, \
	w/2, h, \
	4*(W-w+2), 8, \
	(void*)src, (void*)dst, &desc)

int loop;
for(loop=0;loop<LOOP1;loop++) {

	int i,j,k,l;
	for (i=0; i<s; i++) {
		for (j=0; j<s; j++) {
			float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n;
			DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc);
			DWORD_READ(dma_c_read,n,n,s*N,rgc,c);
			// read C
			e_dma_start(&dma_c_read, E_DMA_0);
			e_dma_wait(E_DMA_0);
			for (k=0; k<s; k++) {
				float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n;
				float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n;
				// read A and B
				DWORD_READ(dma_b_read,n,n,s*N,rgb,b);
				DWORD_READ(dma_a_read,n,n,s*N,rga,a);
				e_dma_start(&dma_b_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				e_dma_start(&dma_a_read, E_DMA_0);
				e_dma_wait(E_DMA_0);
				// transpose B
				int ji, ii;
				for (ji=0; ji<n-1; ji++) {
					for(ii=ji+1; ii<n; ii++) {
						float tmp = b[ji*n+ii];
						b[ji*n+ii] = b[ii*n+ji];
						b[ii*n+ji] = tmp;
					}
				}
				int loop;
				for (loop=0;loop<LOOP3;loop++) {
				// Get into the main computation loop
				for (l=1; l<d; l++) {
					int loop;
					for(loop=0;loop<LOOP2;loop++)
					MatrixMultiply(n, a, b, c);
					// Shift matrix a left by one and shift matrix b up by one
					MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status);
					MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status);
				}
				MatrixMultiply(n, a, b, c);
			} // end LOOP3
			}
			// write C
			e_dma_start(&dma_c_write, E_DMA_1);
			e_dma_wait(E_DMA_1);
		}
	}
} // end LOOP1

	coprthr_tls_brk(memfree);

	MPI_Finalize();

}
Example #20
	__kernel void
nbody_thread( void* p )
{
	my_args_t* pargs = (my_args_t*)p;

	int n = pargs->n;
	int cnt = pargs->cnt;
	unsigned int s_x, s_y, s_z, s_m;
	unsigned int page = 0;
	float dt = pargs->dt;
	float es = pargs->es;
	Particle *particles = pargs->p;
	ParticleV *state = pargs->v;

	int rank, size, npart, i;
	int left, right;

	MPI_Status status;
	MPI_Init(0,MPI_BUF_SIZE);
	MPI_Comm comm = MPI_COMM_THREAD;
	MPI_Comm_rank(comm, &rank);
	MPI_Comm_size(comm, &size);
	MPI_Cart_shift(comm, 0, 1, &left, &right);

	npart = n / size;

	void* memfree = coprthr_tls_sbrk(0);
	Particle* my_particles = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle));
	ParticleV* my_state = (ParticleV*)coprthr_tls_sbrk(npart*sizeof(ParticleV));
	Particle* sendbuf = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle));

	e_dma_copy(my_particles, particles + npart*rank, npart*sizeof(Particle));
	e_dma_copy(my_state, state + npart*rank, npart*sizeof(ParticleV));
	unsigned int rgba_black = 0x00000000;
	unsigned int rgba_white = 0x00ffffff;

	while (cnt--) {

		for (i=0; i<npart; i++) sendbuf[i] = my_particles[i];

		for (i=0; i<size; i++) {
			if (i) MPI_Sendrecv_replace(sendbuf, sizeof(Particle)/sizeof(float)*npart, MPI_FLOAT, left, 1, right, 1, comm, &status);
			ComputeAccel(my_particles, sendbuf, my_state, npart, es);
		}
		e_dma_copy(particles + npart*rank, my_particles, npart*sizeof(Particle));
		ComputeNewPos(my_particles, my_state, npart, dt);

		for(i = 0; i < npart; i++){
			s_x = (int) particles[i + npart*rank].x;
			s_y = (int) particles[i + npart*rank].y;
			if(s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){
				e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_black, 1 * BPP);
			}
			s_x = (int) my_particles[i].x;
			s_y = (int) my_particles[i].y;
			if(cnt  > 1 && s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){
				e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_white, 1 * BPP);
			}
		}

	}

	coprthr_tls_brk(memfree);

	MPI_Finalize();
}
Example #21
int main(int argc, char **argv) {

 
  int rank, M, j,i, *d_graph;
  int *local_matrix, *row_matrix, *col_matrix, *res_matrix, *rowIds, *colIds;
  int P, N, q, p_row, p_col;
  double start, finish;
  MPI_Status status;
 
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &P);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  //INPUT HANDLED BY THE ROOT PROCESSOR
  if (rank == ROOT){
    scanf("%d", &N);  
    q = check_fox_conditions(P,N);

    // Checks if Fox's conditions are met
    if(q == 0){
      MPI_Abort(MPI_COMM_WORLD, 0);
      return 1; //error
    }  

    d_graph = (int*)malloc((N*N) * sizeof(int));

    for(i=0; i < N; i++){
      for(j=0; j < N; j++){
	scanf("%d", &d_graph[GET_MTRX_POS(i,j,N)]);
	if (d_graph[GET_MTRX_POS(i,j,N)] == 0 && i != j) {
	  d_graph[GET_MTRX_POS(i,j,N)] = INF;
	}
      }
    }



    MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if(q > 1)
      divide_matrix( d_graph, N, q); 
      
  }
  else{
    MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
  }
  //---------------COMMON------------------
   
  int lngth = N / q;


  local_matrix = (int*)malloc((lngth*lngth) * sizeof(int));
  row_matrix   = (int*)malloc((lngth*lngth) * sizeof(int));
  col_matrix   = (int*)malloc((lngth*lngth) * sizeof(int));
  res_matrix   = (int*)malloc((lngth*lngth) * sizeof(int));
  
  if(q>1)
    chnkd_MPI_Recv(local_matrix, lngth*lngth, MPI_INT, 0);
  else
    local_matrix = d_graph;
    
  p_row = ( rank / q );
  p_col = ( rank % q );
    
  //CREATE COMMUNICATORS 
  MPI_Group MPI_GROUP_WORLD;
  MPI_Comm_group(MPI_COMM_WORLD, &MPI_GROUP_WORLD);
  MPI_Group row_group, col_group;
  MPI_Comm row_comm, col_comm, grid_comm;
  int tmp_row, tmp_col, proc;
  int row_process_ranks[q], col_process_ranks[q];
    
  for(proc = 0; proc < q; proc++){   
    row_process_ranks[proc] = (p_row * q) + proc;
    col_process_ranks[proc] = ((p_col + proc*q) %(q*q));
  }    
  radixsort(col_process_ranks, q);
  radixsort(row_process_ranks, q);

  MPI_Group_incl(MPI_GROUP_WORLD, q, row_process_ranks, &row_group);  
  MPI_Group_incl(MPI_GROUP_WORLD, q, col_process_ranks, &col_group);  
     
  MPI_Comm_create(MPI_COMM_WORLD, row_group, &row_comm);  
  MPI_Comm_create(MPI_COMM_WORLD, col_group, &col_comm);  

  if ((rank / q) == (rank % q)) {
      memcpy(row_matrix, local_matrix, (lngth*lngth) * sizeof(int));
  }
  int ln,d,flag;
  int step, rotation_src, rotation_dest, src;
  int count = 0;
  memcpy(res_matrix, local_matrix, (lngth*lngth) * sizeof(int));
  rotation_src = (p_row + 1) % q;
  rotation_dest = ((p_row - 1) + q) % q;
  ln = (lngth*q) << 1;
  start = MPI_Wtime();  

  for (d = 2; d < ln; d = d << 1) {
    memcpy(col_matrix, local_matrix, (lngth*lngth) * sizeof(int));
    for ( step = 0;  step < q;  step++) {
      src = (p_row +  step) % q;
      count++;
      if (src == p_col) {
	MPI_Bcast(local_matrix, lngth*lngth, MPI_INT, src, row_comm);
	floyd_warshall( local_matrix, col_matrix, res_matrix, lngth);
      } else {
	MPI_Bcast(row_matrix, lngth*lngth, MPI_INT, src, row_comm);
	floyd_warshall( row_matrix, col_matrix, res_matrix, lngth);
      }  
      if( step < q-1) 
        MPI_Sendrecv_replace(col_matrix, lngth*lngth, MPI_INT, rotation_dest, STD_TAG,rotation_src, STD_TAG, col_comm, MPI_STATUS_IGNORE);
  	
    }
    memcpy(local_matrix, res_matrix, (lngth*lngth) * sizeof(int));
  }
  
  
  int *sol;
  sol = malloc(N*N*sizeof(int));  
  
  MPI_Gather(res_matrix, lngth*lngth, MPI_INT, sol,  lngth*lngth, MPI_INT, 0, MPI_COMM_WORLD);
  
  if (rank == 0) {
    finish = MPI_Wtime();
    printf("Tempo de execução %f\n",finish - start);
  }
 
  if (rank == 0) {
    int row, col, pos_x, pos_y, pos, tmp_y, tmp_x;

    for (i = 0; i < P; i++) {
      pos_x = i / q;
      pos_y = i % q;
      pos = i * lngth*lngth;

      for (row = 0; row < lngth; row++) {
	for (col = 0; col < lngth; col++) {
          tmp_x = GET_MTRX_POS(pos_x,row,lngth);
          tmp_y = GET_MTRX_POS(pos_y,col,lngth);
          
	  if (sol[GET_MTRX_POS(row,col,lngth) + pos] == INF)
	    d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = 0;
	  else
	    d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = sol[GET_MTRX_POS(row,col,lngth) + pos];
	}
      }
    }
    prints_matrix(d_graph,N);
  }
  
  MPI_Finalize();
  return 0;
}
Example #22
int
main (int argc, char **argv)
{
   int row_receive, col_receive;
   int world_rank, world_size;
   int  source, destination;
   double start_time, end_time;
   int i,j;
   int row_i, column_i, cycle;
   int rank2;  
 
   MPI_Comm comm2;
      

   MPI_Init (&argc, &argv);
   MPI_Comm_rank (MPI_COMM_WORLD, &world_rank);
   MPI_Comm_size (MPI_COMM_WORLD, &world_size);
   
   double root_p;                  /* sqrt of no of processors */
   root_p = sqrt ((double) world_size);
    
   if (NRA % (int) root_p != 0)
     {
      printf ("Please enter a processor count which a perfect square and a multiple ");
      MPI_Abort (MPI_COMM_WORLD, 1);
     }
    
   int sub_matrix = NRA / root_p;
   
   /* Need to create a grid of root p x root p processors */
   dims [0] = (int) root_p;
   dims [1] = (int) root_p;

   period [0] = 1;
   period [1] = 1;

   /* Now Matrix is made up of sub-matrices of size n/root p */

   double sub_A [sub_matrix][sub_matrix];   
   double sub_B [sub_matrix][sub_matrix];   
   double sub_C [sub_matrix][sub_matrix];   
   double sub_CT [sub_matrix][sub_matrix];   
   
   /* Now creating a cartesian topology */
   
   MPI_Cart_create (MPI_COMM_WORLD, 2, dims, period, 0, &comm2);     
   
   /* Now get the new rank */
   MPI_Comm_rank (comm2, &world_rank);
   
   /* Determine process coordinates based on rank */
   MPI_Cart_coords (comm2, world_rank, 2, coordinates);

   

   Init_zero_Mat (sub_matrix, sub_C);

   if (world_rank == 0)
     {
        Matrix_init (NRA, NRA, A);
        Matrix_init (NRA, NRA, B);
  
        //print_Mat (NRA, A);
        //print_Mat (NRA, B);
     
        Init_zero_Mat (sub_matrix, sub_C); 

        /* Let us send each portion of A and B and start multiplying */         
         start_time = MPI_Wtime ();

         for (i = 0; i < root_p; i++)
           {
              for (j = 0; j < root_p; j++)
                 {
                    if ( i != 0 || j != 0)
                      {
                         send_coordinates [0] = i;
                         send_coordinates [1] = j;
                         row_i = -1;
                         int k;
                         for (k = i * sub_matrix; k < i * sub_matrix + sub_matrix; k++)
                           {
                             column_i = 0;
                             row_i ++;
                             int l;
                             for (l = j *sub_matrix; l < j * sub_matrix + sub_matrix; l++)
                              {
                                       
                                 sub_A[row_i][column_i] = A[k][l];
                                 sub_B[row_i][column_i] = B[k][l];
                                 column_i++;
                              } 
                            }
                            
                            /* Adjust the coordinates for the initial skew and send the block to processor pij */
                            send_coordinates [0] = i;
                            send_coordinates [1] = ((j - i) < 0) ? (j-i) + root_p : (j-i);
                            MPI_Cart_rank (comm2, send_coordinates, &rank2);
                            MPI_Send (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE, rank2, 1, comm2);
                            send_coordinates [0] = ((i-j) < 0) ? (i-j) + root_p : i-j;
                            send_coordinates [1] = j;
                        
                            
                            MPI_Cart_rank (comm2, send_coordinates, &rank2);
                            MPI_Send (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE, rank2, 2, comm2);
 

                      }
                   }
              } 

             
          /* Now copy the blocks kept on process 0 */
          for (i =0 ; i<sub_matrix; i++)
           {
            for ( j = 0; j < sub_matrix; j++)
              {
                 sub_A[i][j] = A[i][j];
                 sub_B[i][j] = B[i][j];
               }
            }
     
           /* calculate c for matrix in process 0 */
           /* Todo: use in function */
           for (cycle = 0; cycle < sub_matrix; cycle++)
            {
                  
                Matrix_mul (sub_matrix, sub_A, sub_B, sub_C);      
            
            
               MPI_Cart_shift (comm2, 1, -1, &source, &destination);
               MPI_Sendrecv_replace (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 1, source, 1, comm2, &status1);
               
               MPI_Cart_shift (comm2, 0, -1, &source, &destination);
               MPI_Sendrecv_replace (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 2, source, 2, comm2, &status2);
            
     
             }
             
             }
/*end of master */


    else 
      {
         MPI_Recv (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE, 0, 1, comm2, &status1);
         MPI_Recv (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE, 0, 2, comm2, &status2);

         
       for (cycle = 0; cycle < sub_matrix; cycle ++)
         {
            Matrix_mul (sub_matrix, sub_A, sub_B, sub_C);
         
            MPI_Cart_shift (comm2, 1, -1, &source, &destination);
            MPI_Sendrecv_replace (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 1, source, 1, comm2, &status1);

            MPI_Cart_shift (comm2, 0, -1, &source, &destination);
            MPI_Sendrecv_replace (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 2, source, 2, comm2, &status2);
         }

        /* send final result to process 0 */
  
        MPI_Send (sub_C, sub_matrix * sub_matrix, MPI_DOUBLE, 0 , world_rank, comm2);
    
      }

    if (world_rank == 0)
      {
         Init_zero_Mat (NRA , C);
      
        int k;
     for (i =1; i < world_size; i++)
      {
         MPI_Recv (sub_CT, sub_matrix * sub_matrix, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,comm2, &status3);
         MPI_Cart_coords (comm2, status3.MPI_TAG, 2, send_coordinates);
          
         row_receive = send_coordinates [0];
         col_receive = send_coordinates [1];
         row_i = -1;
         column_i = 0;
         
         for ( j = row_receive * sub_matrix; j < row_receive * sub_matrix + sub_matrix; j++)
          {
            row_i ++;
            for ( k = col_receive * sub_matrix; k < col_receive * sub_matrix + sub_matrix; k++)
               {
                  C[j][k] = sub_CT[row_i][column_i];
                  column_i ++;
               }
             column_i = 0;
           }
        }
 
     /* On process 0 */
      for (i = 0; i < sub_matrix; i++)
        {
          for (j =0 ; j <sub_matrix; j++)
            {
                C[i][j] = sub_C[i][j];
            }
         }
      end_time = MPI_Wtime ();
      double serial = Verify (NRA, A,B,C);
      printf ("speedup :%f s",serial/( end_time - start_time));
      }

    MPI_Finalize ();
    return 0;
  }
Example #23
/*
   This program is from mpich/tsuite/pt2pt and should be changed there only.
   It needs gcomm and dtype from mpich/tsuite, and can be run with 
   any number of processes > 1.

   This version uses sendrecv and sendrecv_replace (but only in the
   head-to-head mode).
 */
int main( int argc, char **argv )
{
MPI_Datatype *types;
void         **inbufs, **outbufs;
char         **names;
int          *counts, *bytesize, ntype;
MPI_Comm     comms[20];
int          ncomm = 20, rank, np, partner, tag, count;
int          i, j, k, err, world_rank, errloc;
MPI_Status   status;
char         *obuf, *ibuf;

MPI_Init( &argc, &argv );

AllocateForData( &types, &inbufs, &outbufs, &counts, &bytesize, 
		 &names, &ntype );
GenerateData( types, inbufs, outbufs, counts, bytesize, names, &ntype );

MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
MakeComms( comms, 20, &ncomm, 0 );

/* Test over a wide range of datatypes and communicators */
err = 0;
for (i=0; i<ncomm; i++) {
    MPI_Comm_rank( comms[i], &rank );
    MPI_Comm_size( comms[i], &np );
    if (np < 2) continue;
    tag = i;
    if (rank == 0) 
	partner = np - 1;
    if (rank == np - 1)
	partner = 0;
    for (j=0; j<ntype; j++) {
	if (world_rank == 0) 
	    fprintf( stdout, "Testing type %s\n", names[j] );
        if (rank == 0 || rank == np - 1) {
	    obuf = outbufs[j];
	    for (k=0; k<bytesize[j]; k++) 
		obuf[k] = 0;
	    MPI_Sendrecv( inbufs[j], counts[j], types[j], partner, tag, 
			  outbufs[j], counts[j], types[j], partner, tag, 
			  comms[i], &status );
            /* Test correct */
            MPI_Get_count( &status, types[j], &count );
            if (count != counts[j]) {
		fprintf( stderr, 
			"Error in counts (got %d expected %d) with type %s\n",
			 count, counts[j], names[j] );
                err++;
                }
            if (status.MPI_SOURCE != partner) {
		fprintf( stderr, 
			"Error in source (got %d expected %d) with type %s\n",
			 status.MPI_SOURCE, partner, names[j] );
                err++;
                }
            if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) {
		char *p1, *p2;
		fprintf( stderr, 
                  "Error in data with type %s (type %d on %d) at byte %d\n", 
			 names[j], j, world_rank, errloc - 1 );
		p1 = (char *)inbufs[j];
		p2 = (char *)outbufs[j];
		fprintf( stderr, 
			"Got %x expected %x\n", p1[errloc-1], p2[errloc-1] );
                err++;
                }
	    /* Now do sendrecv_replace */
	    obuf = outbufs[j];
	    ibuf = inbufs[j];
	    for (k=0; k<bytesize[j]; k++) 
		obuf[k] = ibuf[k];
	    /* This would be a better test if the data was different... */
	    MPI_Sendrecv_replace( obuf, counts[j], types[j], partner, tag, 
				  partner, tag, comms[i], &status );
            /* Test correct */
            MPI_Get_count( &status, types[j], &count );
            if (count != counts[j]) {
		fprintf( stderr, 
			"Error in counts (got %d expected %d) with type %s\n",
			 count, counts[j], names[j] );
                err++;
                }
            if (status.MPI_SOURCE != partner) {
		fprintf( stderr, 
			"Error in source (got %d expected %d) with type %s\n",
			 status.MPI_SOURCE, partner, names[j] );
                err++;
                }
            if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) {
		char *p1, *p2;
		fprintf( stderr, 
                  "Error in data with type %s (type %d on %d) at byte %d\n", 
			 names[j], j, world_rank, errloc - 1 );
		p1 = (char *)inbufs[j];
		p2 = (char *)outbufs[j];
		fprintf( stderr, 
			"Got %x expected %x\n", p1[errloc-1], p2[errloc-1] );
                err++;
                }
            }
	}
    }
if (err > 0) {
    fprintf( stderr, "%d errors on %d\n", err, rank );
    }
FreeDatatypes( types, inbufs, outbufs, counts, bytesize, names, ntype );
FreeComms( comms, ncomm );
MPI_Finalize();
return err;
}
Example #24
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status;

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // coordinates now holds the grid coordinates of process rank

	// create a row communicator
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);


	// set time variables for different MPI parts
	double read_time, send_dim_time, send_blocks_time, gather_time, write_time, dod_cajt;


	read_time = MPI_Wtime();
	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n",
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0
				|| matrices_a_b_dimensions[1] % sqrt_size != 0
				|| matrices_a_b_dimensions[2] % sqrt_size != 0
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processes\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}

	read_time -= MPI_Wtime();

	send_dim_time = MPI_Wtime();
	// send dimensions to all peers
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// has to be blocking, because the data is used right afterwards...
	MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator);
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	send_dim_time -= MPI_Wtime();


	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i,j;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}

	dod_cajt = MPI_Wtime();
	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = 
A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = 
B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	}

	
	dod_cajt -= MPI_Wtime(); // should this be counted in send_blocks_time?

	// send a block to each process
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*MPI_Scatter(A_array, A_local_block_size, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);
	MPI_Scatter(B_array, B_local_block_size, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);
	*/

	send_blocks_time = MPI_Wtime();

	int displsA[size];
	int displsB[size];
	int localblsizA[size];
	int localblsizB[size];
	MPI_Request requests[2];
	MPI_Status statuses[2];
	for (i=0; i<sqrt_size; i++){
		for (j=0; j<sqrt_size; j++){
			displsA[i*sqrt_size + j] = (i*sqrt_size + j)*A_local_block_size; //(i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size;
			displsB[i*sqrt_size + j] = (i*sqrt_size + j)*B_local_block_size; //(j + ((j+i)%size)*sqrt_size)*B_local_block_size;
			localblsizA[i*sqrt_size+j] = A_local_block_size;
			localblsizB[i*sqrt_size+j] = B_local_block_size;
		}
	}
	MPI_Iscatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size,
        		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[0]);
	MPI_Iscatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size,
        		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[1]);

	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


	// fix initial arrangements before the core algorithm starts - the point is that the blocks already have to be shifted before the first computational part of the algorithm begins...
	/*if(coordinates[0] != 0){
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
				(coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0,
				(coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
	}
	if(coordinates[1] != 0){
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0,
				(coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
	}*/

	// cannon's algorithm
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;

	MPI_Waitall(2, requests, statuses);
	send_blocks_time -= MPI_Wtime();


	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] * 
B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		start = MPI_Wtime();
		// rotate blocks horizontally
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, // this could be done with MPI_Alltoallv and its in-place variant, but it would be inefficient - see the comments!
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start;
	}


	// get C parts from other processes at rank 0
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	gather_time =  MPI_Wtime();
	MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
               		 C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
                   0, cartesian_grid_communicator);  // blocking, because the result is used right afterwards... right?
	gather_time -= MPI_Wtime();
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	// generating output at rank 0
	if (rank == 0) {
		write_time = MPI_Wtime();
		// convert the ID array into the actual C matrix
		int i, j, k, row, column;
		for (i = 0; i < sqrt_size; i++){  // block row index
			for (j = 0; j < sqrt_size; j++){ // block column index
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						C[i * A_local_block_rows + row] [j * B_local_block_columns + column] =
							C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns)
							+ (row * B_local_block_columns) + column];
					}
				}
			}
		}
		write_time -= MPI_Wtime();

		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);
		printf("Read time:        %lf\n", -read_time);
		printf("Send dims time:   %lf\n", -send_dim_time);
		printf("Send blocks time: %lf\n", -send_blocks_time);
		printf("Gather time:      %lf\n", -gather_time);
		printf("Addit. time:      %lf\n", -dod_cajt);
		printf("Write time:       %lf\n", -write_time);

		if (argc == 4){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					// print the difference between the serial reference value and the parallel result
					printf("%7.3f ", temp - C[i][j]);
					if(temp != C[i][j]){
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}
	}

	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();
}
Example #25
0
File: MPI-api.c Project: 8l/rose
void declareBindings (void)
{
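  /* This function appears to exist only so that every MPI-1 binding is referenced once
   * (e.g. so a compiler front end can check the declarations); it is not meant to be
   * executed, since most of the arguments are left uninitialized. */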
  /* === Point-to-point === */
  void* buf;
  int count;
  MPI_Datatype datatype;
  int dest;
  int tag;
  MPI_Comm comm;
  MPI_Send (buf, count, datatype, dest, tag, comm); // L12
  int source;
  MPI_Status status;
  MPI_Recv (buf, count, datatype, source, tag, comm, &status); // L15
  MPI_Get_count (&status, datatype, &count);
  MPI_Bsend (buf, count, datatype, dest, tag, comm);
  MPI_Ssend (buf, count, datatype, dest, tag, comm);
  MPI_Rsend (buf, count, datatype, dest, tag, comm);
  void* buffer;
  int size;
  MPI_Buffer_attach (buffer, size); // L22
  MPI_Buffer_detach (buffer, &size);
  MPI_Request request;
  MPI_Isend (buf, count, datatype, dest, tag, comm, &request); // L25
  MPI_Ibsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Issend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irsend (buf, count, datatype, dest, tag, comm, &request);
  MPI_Irecv (buf, count, datatype, source, tag, comm, &request);
  MPI_Wait (&request, &status);
  int flag;
  MPI_Test (&request, &flag, &status); // L32
  MPI_Request_free (&request);
  MPI_Request* array_of_requests;
  int index;
  MPI_Waitany (count, array_of_requests, &index, &status); // L36
  MPI_Testany (count, array_of_requests, &index, &flag, &status);
  MPI_Status* array_of_statuses;
  MPI_Waitall (count, array_of_requests, array_of_statuses); // L39
  MPI_Testall (count, array_of_requests, &flag, array_of_statuses);
  int incount;
  int outcount;
  int* array_of_indices;
  MPI_Waitsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L44--45
  MPI_Testsome (incount, array_of_requests, &outcount, array_of_indices,
		array_of_statuses); // L46--47
  MPI_Iprobe (source, tag, comm, &flag, &status); // L48
  MPI_Probe (source, tag, comm, &status);
  MPI_Cancel (&request);
  MPI_Test_cancelled (&status, &flag);
  MPI_Send_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Bsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Ssend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Rsend_init (buf, count, datatype, dest, tag, comm, &request);
  MPI_Recv_init (buf, count, datatype, source, tag, comm, &request);
  MPI_Start (&request);
  MPI_Startall (count, array_of_requests);
  void* sendbuf;
  int sendcount;
  MPI_Datatype sendtype;
  int sendtag;
  void* recvbuf;
  int recvcount;
  MPI_Datatype recvtype;
  int recvtag;
  MPI_Sendrecv (sendbuf, sendcount, sendtype, dest, sendtag,
		recvbuf, recvcount, recvtype, source, recvtag,
		comm, &status); // L67--69
  MPI_Sendrecv_replace (buf, count, datatype, dest, sendtag, source, recvtag,
			comm, &status); // L70--71
  MPI_Datatype oldtype;
  MPI_Datatype newtype;
  MPI_Type_contiguous (count, oldtype, &newtype); // L74
  int blocklength;
  {
    int stride;
    MPI_Type_vector (count, blocklength, stride, oldtype, &newtype); // L78
  }
  {
    MPI_Aint stride;
    MPI_Type_hvector (count, blocklength, stride, oldtype, &newtype); // L82
  }
  int* array_of_blocklengths;
  {
    int* array_of_displacements;
    MPI_Type_indexed (count, array_of_blocklengths, array_of_displacements,
		      oldtype, &newtype); // L87--88
  }
  {
    MPI_Aint* array_of_displacements;
    MPI_Type_hindexed (count, array_of_blocklengths, array_of_displacements,
                       oldtype, &newtype); // L92--93
    MPI_Datatype* array_of_types;
    MPI_Type_struct (count, array_of_blocklengths, array_of_displacements,
                     array_of_types, &newtype); // L95--96
  }
  void* location;
  MPI_Aint address;
  MPI_Address (location, &address); // L100
  MPI_Aint extent;
  MPI_Type_extent (datatype, &extent); // L102
  MPI_Type_size (datatype, &size);
  MPI_Aint displacement;
  MPI_Type_lb (datatype, &displacement); // L105
  MPI_Type_ub (datatype, &displacement);
  MPI_Type_commit (&datatype);
  MPI_Type_free (&datatype);
  MPI_Get_elements (&status, datatype, &count);
  void* inbuf;
  void* outbuf;
  int outsize;
  int position;
  MPI_Pack (inbuf, incount, datatype, outbuf, outsize, &position, comm); // L114
  int insize;
  MPI_Unpack (inbuf, insize, &position, outbuf, outcount, datatype,
	      comm); // L116--117
  MPI_Pack_size (incount, datatype, comm, &size);

  /* === Collectives === */
  MPI_Barrier (comm); // L121
  int root;
  MPI_Bcast (buffer, count, datatype, root, comm); // L123
  MPI_Gather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
	      root, comm); // L124--125
  int* recvcounts;
  int* displs;
  MPI_Gatherv (sendbuf, sendcount, sendtype,
               recvbuf, recvcounts, displs, recvtype,
	       root, comm); // L128--130
  MPI_Scatter (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
               root, comm); // L131--132
  int* sendcounts;
  MPI_Scatterv (sendbuf, sendcounts, displs, sendtype,
		recvbuf, recvcount, recvtype, root, comm); // L134--135
  MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
                 comm); // L136--137
  MPI_Allgatherv (sendbuf, sendcount, sendtype,
		  recvbuf, recvcounts, displs, recvtype,
		  comm); // L138--140
  MPI_Alltoall (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
		comm); // L141--142
  int* sdispls;
  int* rdispls;
  MPI_Alltoallv (sendbuf, sendcounts, sdispls, sendtype,
                 recvbuf, recvcounts, rdispls, recvtype,
		 comm); // L145--147
  MPI_Op op;
  MPI_Reduce (sendbuf, recvbuf, count, datatype, op, root, comm); // L149
#if 0
  MPI_User_function function;
  int commute;
  MPI_Op_create (function, commute, &op); // L153
#endif
  MPI_Op_free (&op); // L155
  MPI_Allreduce (sendbuf, recvbuf, count, datatype, op, comm);
  MPI_Reduce_scatter (sendbuf, recvbuf, recvcounts, datatype, op, comm);
  MPI_Scan (sendbuf, recvbuf, count, datatype, op, comm);

  /* === Groups, contexts, and communicators === */
  MPI_Group group;
  MPI_Group_size (group, &size); // L162
  int rank;
  MPI_Group_rank (group, &rank); // L164
  MPI_Group group1;
  int n;
  int* ranks1;
  MPI_Group group2;
  int* ranks2;
  MPI_Group_translate_ranks (group1, n, ranks1, group2, ranks2); // L170
  int result;
  MPI_Group_compare (group1, group2, &result); // L172
  MPI_Group newgroup;
  MPI_Group_union (group1, group2, &newgroup); // L174
  MPI_Group_intersection (group1, group2, &newgroup);
  MPI_Group_difference (group1, group2, &newgroup);
  int* ranks;
  MPI_Group_incl (group, n, ranks, &newgroup); // L178
  MPI_Group_excl (group, n, ranks, &newgroup);
  extern int ranges[][3];
  MPI_Group_range_incl (group, n, ranges, &newgroup); // L181
  MPI_Group_range_excl (group, n, ranges, &newgroup);
  MPI_Group_free (&group);
  MPI_Comm_size (comm, &size);
  MPI_Comm_rank (comm, &rank);
  MPI_Comm comm1;
  MPI_Comm comm2;
  MPI_Comm_compare (comm1, comm2, &result);
  MPI_Comm newcomm;
  MPI_Comm_dup (comm, &newcomm);
  MPI_Comm_create (comm, group, &newcomm);
  int color;
  int key;
  MPI_Comm_split (comm, color, key, &newcomm); // L194
  MPI_Comm_free (&comm);
  MPI_Comm_test_inter (comm, &flag);
  MPI_Comm_remote_size (comm, &size);
  MPI_Comm_remote_group (comm, &group);
  MPI_Comm local_comm;
  int local_leader;
  MPI_Comm peer_comm;
  int remote_leader;
  MPI_Comm newintercomm;
  MPI_Intercomm_create (local_comm, local_leader, peer_comm, remote_leader, tag,
			&newintercomm); // L204--205
  MPI_Comm intercomm;
  MPI_Comm newintracomm;
  int high;
  MPI_Intercomm_merge (intercomm, high, &newintracomm); // L209
  int keyval;
#if 0
  MPI_Copy_function copy_fn;
  MPI_Delete_function delete_fn;
  void* extra_state;
  MPI_Keyval_create (copy_fn, delete_fn, &keyval, extra_state); // L215
#endif
  MPI_Keyval_free (&keyval); // L217
  void* attribute_val;
  MPI_Attr_put (comm, keyval, attribute_val); // L219
  MPI_Attr_get (comm, keyval, attribute_val, &flag);
  MPI_Attr_delete (comm, keyval);

  /* === Environmental inquiry === */
  char* name;
  int resultlen;
  MPI_Get_processor_name (name, &resultlen); // L226
  MPI_Errhandler errhandler;
#if 0
  MPI_Handler_function function;
  MPI_Errhandler_create (function, &errhandler); // L230
#endif
  MPI_Errhandler_set (comm, errhandler); // L232
  MPI_Errhandler_get (comm, &errhandler);
  MPI_Errhandler_free (&errhandler);
  int errorcode;
  char* string;
  MPI_Error_string (errorcode, string, &resultlen); // L237
  int errorclass;
  MPI_Error_class (errorcode, &errorclass); // L239
  MPI_Wtime ();
  MPI_Wtick ();
  int argc;
  char** argv;
  MPI_Init (&argc, &argv); // L244
  MPI_Finalize ();
  MPI_Initialized (&flag);
  MPI_Abort (comm, errorcode);
}
Example #26
0
void exchange_v(float ** vy,
	float ** bufferlef_to_rig, float ** bufferrig_to_lef, 
	float ** buffertop_to_bot, float ** bufferbot_to_top,
	MPI_Request * req_send, MPI_Request * req_rec){


	extern int NX, NY, POS[3], NPROCX, NPROCY, BOUNDARY, FDORDER;
	extern int INDEX[5];
	extern const int TAG1,TAG2,TAG5,TAG6;
	MPI_Status  status;
	int i, j, fdo, fdo3, n, l;

	fdo = FDORDER/2 + 1;
	fdo3 = 2*fdo;
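	/* Halo exchange for the field vy: boundary strips of the local grid are packed into
	 * buffers, swapped with the neighbouring ranks (INDEX[1..4]) via MPI_Sendrecv_replace,
	 * and then unpacked into the ghost layers on the opposite side of the local grid. */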

	/* top - bottom */

	if (POS[2]!=0)	/* no boundary exchange at top of global grid */
	for (i=1;i<=NX;i++){
		n = 1;
		/* storage of top of local volume into buffer */
		for (l=1;l<=fdo-1;l++) {
			buffertop_to_bot[i][n++]  =  vy[l][i];
		}
		
	}


	if (POS[2]!=NPROCY-1)	/* no boundary exchange at bottom of global grid */
	for (i=1;i<=NX;i++){
		/* storage of bottom of local volume into buffer */
		n = 1;
		/*for (l=1;l<=fdo;l++) {
			bufferbot_to_top[i][n++]  =  vy[NY-l+1][i];
		}*/
		
	}
	
	
  	 /* send and receive values for points at inner boundaries */

/*
	MPI_Bsend(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG5,MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Recv(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG5,MPI_COMM_WORLD,&status);
	MPI_Bsend(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG6,MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Recv(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG6,MPI_COMM_WORLD,&status);   
*/

	/* send and receive values at edges of the local grid */
	/*for (i=2;i<=3;i++){
		MPI_Start(&req_send[i]);
		MPI_Wait(&req_send[i],&status);
		MPI_Start(&req_rec[i]);
		MPI_Wait(&req_rec[i],&status);
	}*/
	
	/* alternative communication */
	/* still blocking communication */
	MPI_Sendrecv_replace(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG5,INDEX[4],TAG5,MPI_COMM_WORLD,&status);
	/*MPI_Sendrecv_replace(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG6,INDEX[3],TAG6,MPI_COMM_WORLD,&status);*/



	if (POS[2]!=NPROCY-1)	/* no boundary exchange at bottom of global grid */
	for (i=1;i<=NX;i++){
		n = 1;
		for (l=1;l<=fdo-1;l++) {
			vy[NY+l][i] = buffertop_to_bot[i][n++];
		}
		/*for (l=1;l<=fdo;l++) {
			vx[NY+l][i] = buffertop_to_bot[i][n++];
		}*/
	}
	


	if (POS[2]!=0)	/* no boundary exchange at top of global grid */
	for (i=1;i<=NX;i++){
		n = 1;
		/*for (l=1;l<=fdo;l++) {
			vy[1-l][i] = bufferbot_to_top[i][n++];
		}*/
		/*for (l=1;l<=fdo-1;l++) {
			vx[1-l][i] = bufferbot_to_top[i][n++];
		}*/
	}	
	
	
	/* left - right */

	/* exchange if periodic boundary condition is applied */
	if ((BOUNDARY) || (POS[1]!=0))	
	for (j=1;j<=NY;j++){
		/* storage of left edge of local volume into buffer */
		n = 1;
		for (l=1;l<fdo;l++) {
			bufferlef_to_rig[j][n++] =  vy[j][l];
		}
	}


	/* at the right edge of the global grid, exchange only if a periodic boundary condition is applied */
	if ((BOUNDARY) || (POS[1]!=NPROCX-1))	/* no boundary exchange at right edge of global grid */
	for (j=1;j<=NY;j++){
		/* storage of right edge of local volume into buffer */
		n = 1;
		/*for (l=1;l<fdo-1;l++) {
			bufferrig_to_lef[j][n++] =  vy[j][NX-l+1];
		}*/
	}

 	 /* send and receive values for points at inner boundaries */

/*
 	MPI_Bsend(&bufferlef_to_rig[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[1],TAG1,MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Recv(&bufferlef_to_rig[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[2],TAG1,MPI_COMM_WORLD,&status);
	MPI_Bsend(&bufferrig_to_lef[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[2],TAG2,MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Recv(&bufferrig_to_lef[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[1],TAG2,MPI_COMM_WORLD,&status);
*/


	/* send and receive values at edges of the local grid */
	/*for (i=0;i<=1;i++){
		MPI_Start(&req_send[i]);
		MPI_Wait(&req_send[i],&status);
		MPI_Start(&req_rec[i]);
		MPI_Wait(&req_rec[i],&status);
	}*/
	
	/* alternative communication */
	/* still blocking communication */
	MPI_Sendrecv_replace(&bufferlef_to_rig[1][1],NY*fdo3,MPI_FLOAT,INDEX[1],TAG1,INDEX[2],TAG1,MPI_COMM_WORLD,&status);
	/*MPI_Sendrecv_replace(&bufferrig_to_lef[1][1],NY*fdo3,MPI_FLOAT,INDEX[2],TAG2,INDEX[1],TAG2,MPI_COMM_WORLD,&status);*/


	/* at the right edge of the global grid, exchange only if a periodic boundary condition is applied */
	if ((BOUNDARY) || (POS[1]!=NPROCX-1))	/* no boundary exchange at right edge of global grid */
	for (j=1;j<=NY;j++){
		n = 1;
		for (l=1;l<fdo;l++) {
			vy[j][NX+l] = bufferlef_to_rig[j][n++];
		}
	}

	/* at the left edge of the global grid, exchange only if a periodic boundary condition is applied */
	if ((BOUNDARY) || (POS[1]!=0))	/* no boundary exchange at left edge of global grid */
	for (j=1;j<=NY;j++){
		n = 1;
		/*for (l=1;l<fdo-1;l++) {
			vy[j][1-l] = bufferrig_to_lef[j][n++];
		}*/
	}
	
		

}
Example #27
0
/* Performs Gaussian elimination on the given matrix of doubles.  The
 * parameter numRows gives the number of rows in the matrix, and
 * numCols the number of columns.  Upon return, the matrix will be in
 * reduced row-echelon form.
 */
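/* Note: in this parallel version the rows are apparently distributed one per MPI process
 * (process `rank` owns row `rank`), and the `matrix` argument points to that single local
 * row of length numCols. */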
int gausselim(double* matrix, int numRows, int numCols, int debug) {
  int top = 0;           // the current top row of the matrix
  int col = 0;           // column index of the current pivot
  int pivotRow = 0;      // row index of current pivot
  double pivot = 0.0;    // the value of the current pivot
  int j = 0;             // loop variable over columns of matrix
  double tmp = 0.0;      // temporary double variable
  MPI_Status status;     // status object needed for receives
  int rank;              // rank of this process
  int nprocs;            // number of processes
  double* toprow = (double*)malloc(numCols * sizeof(double));

  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  
  for (top=col=0; top<numRows && col< numCols; top++, col++) {

    /* At this point we know that the submatrix consisting of the
     * first top rows of A is in reduced row-echelon form.  We will now
     * consider the submatrix B consisting of the remaining rows.  We
     * know, additionally, that the first col columns of B are
     * all zero.
     */

    if (debug && rank == 0) {
      printf("Top: %d\n", top);
    }

    /* Step 1: Locate the leftmost column of B that does not consist
     * of all zeros, if one exists.  The top nonzero entry of this
     * column is the pivot. */
  
    for (; col < numCols; col++) {
      if (matrix[col] != 0.0 && rank >= top) {
	MPI_Allreduce(&rank, &pivotRow, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
      }
      else {
	MPI_Allreduce(&nprocs, &pivotRow, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
      }
      if (pivotRow < nprocs){
	break;
      }
    }

    if (col >= numCols) {
      break;
    }

    if (debug) {
      if (rank == 0) {
	printf("Step 1 result: col=%d, pivotRow=%d\n\n", col, pivotRow);
      }
    }
      
    /* At this point we are guaranteed that pivot = A[pivotRow,col] is
     * nonzero.  We also know that all the columns of B to the left of
     * col consist entirely of zeros. */

    /* Step 2: Interchange the top row with the pivot row, if
     * necessary, so that the entry at the top of the column found in
     * Step 1 is nonzero. */

    if (pivotRow != top) {
      if (rank == top) {
	MPI_Sendrecv_replace(matrix, numCols, MPI_DOUBLE, pivotRow, 0, 
                             pivotRow, 0, MPI_COMM_WORLD, &status);
      }
      else if (rank == pivotRow) {
	MPI_Sendrecv_replace(matrix, numCols, MPI_DOUBLE, top, 0, 
                             top, 0,  MPI_COMM_WORLD, &status);
      }
    }

    if (rank == top) {
      pivot = matrix[col];
    }
    
    if (debug) {
      printMatrix("Step 2 result: \n", matrix, numRows, numCols);
    }

    /* At this point we are guaranteed that A[top,col] = pivot is
     * nonzero. Also, we know that (i>=top and j<col) implies
     * A[i,j] = 0. */

    /* Step 3: Divide the top row by pivot in order to introduce a
     * leading 1. */

    if (rank == top) {
      for (j = col; j < numCols; j++) {
	matrix[j] /= pivot;
	toprow[j] = matrix[j];
      }
    }

    if (debug) {
      printMatrix("Step 3 result:\n", matrix, numRows, numCols);
    }

    /* At this point we are guaranteed that A[top,col] is 1.0,
     * assuming that floating point arithmetic guarantees that a/a
     * equals 1.0 for any nonzero double a. */

    MPI_Bcast(toprow, numCols, MPI_DOUBLE, top, MPI_COMM_WORLD);

    /* Step 4: Add suitable multiples of the top row to rows below so
     * that all entries below the leading 1 become zero. */

    if (rank != top) {
      tmp = matrix[col];
      for (j = col; j < numCols; j++) {
	matrix[j] -= toprow[j]*tmp;
      }
    }

    if (debug) {
      printMatrix("Step 4 result: \n", matrix, numRows, numCols);
    }
  }
  free(toprow);

  return 0;
}
Example #28
0
static void test_pair (void)
{
  int prev, next, count, tag, index, i, outcount, indices[2];
  int rank, size, flag, ierr, reqcount;
  double send_buf[TEST_SIZE], recv_buf[TEST_SIZE];
  double buffered_send_buf[TEST_SIZE * 2 + MPI_BSEND_OVERHEAD]; /* factor of two is based on guessing - only dynamic allocation would be safe */
  void *buffer;
  MPI_Status statuses[2];
  MPI_Status status;
  MPI_Request requests[2];
  MPI_Comm dupcom, intercom;
#ifdef V_T

  struct _VT_FuncFrameHandle {
      char *name;
      int func;
      int frame;
  };
  typedef struct _VT_FuncFrameHandle VT_FuncFrameHandle_t;

  VT_FuncFrameHandle_t normal_sends,
      buffered_sends,
      buffered_persistent_sends,
      ready_sends,
      sync_sends,
      nblock_sends,
      nblock_rsends,
      nblock_ssends,
      pers_sends,
      pers_rsends,
      pers_ssends,
      sendrecv,
      sendrecv_repl,
      intercomm;

  int classid;
  VT_classdef( "Application:test_pair", &classid );


#define VT_REGION_DEF( _name, _nameframe, _class ) \
        (_nameframe).name=_name; \
        VT_funcdef( (_nameframe).name, _class, &((_nameframe).func) );
#define VT_BEGIN_REGION( _nameframe ) \
        LOCDEF(); \
        VT_begin( (_nameframe).func )
#define VT_END_REGION( _nameframe ) \
        LOCDEF(); VT_end( (_nameframe).func )
#else
#define VT_REGION_DEF( _name, _nameframe, _class )
#define VT_BEGIN_REGION( _nameframe )
#define VT_END_REGION( _nameframe )

#endif




  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &size);
  if ( size < 2 ) {
      if ( rank == 0 ) {
	  printf("Program needs to be run on at least 2 processes.\n");
      }
      ierr = MPI_Abort( MPI_COMM_WORLD, 66 );
  }
  ierr = MPI_Comm_dup(MPI_COMM_WORLD, &dupcom);

  if ( rank >= 2 ) {
      /*      printf( "%d Calling finalize.\n", rank ); */
      ierr = MPI_Finalize( );
      exit(0);
  }

  next = rank + 1;
  if (next >= 2)
    next = 0;

  prev = rank - 1;
  if (prev < 0)
    prev = 1;
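  /* only ranks 0 and 1 take part from here on, so in this ring of size two
     "next" and "prev" always refer to the same partner process */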

  VT_REGION_DEF( "Normal_Sends", normal_sends, classid );
  VT_REGION_DEF( "Buffered_Sends", buffered_sends, classid );
  VT_REGION_DEF( "Buffered_Persistent_Sends", buffered_persistent_sends, classid );
  VT_REGION_DEF( "Ready_Sends", ready_sends, classid );
  VT_REGION_DEF( "Sync_Sends", sync_sends, classid );
  VT_REGION_DEF( "nblock_Sends", nblock_sends, classid );
  VT_REGION_DEF( "nblock_RSends", nblock_rsends, classid );
  VT_REGION_DEF( "nblock_SSends", nblock_ssends, classid );
  VT_REGION_DEF( "Pers_Sends", pers_sends, classid );
  VT_REGION_DEF( "Pers_RSends", pers_rsends, classid );
  VT_REGION_DEF( "Pers_SSends", pers_ssends, classid );
  VT_REGION_DEF( "SendRecv", sendrecv, classid );
  VT_REGION_DEF( "SendRevc_Repl", sendrecv_repl, classid );
  VT_REGION_DEF( "InterComm", intercomm, classid );



/*
 * Normal sends
 */

  VT_BEGIN_REGION( normal_sends );

  if (rank == 0)
    printf ("Send\n");

  tag = 0x100;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);

    LOCDEF();

    MPI_Send(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
              MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv");
  }
  else {

    LOCDEF();

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv");
    init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);

  }

  VT_END_REGION( normal_sends );


/*
 * Buffered sends
 */

  VT_BEGIN_REGION( buffered_sends );

  if (rank == 0)
    printf ("Buffered Send\n");

  tag = 138;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);

    LOCDEF();

    MPI_Buffer_attach(buffered_send_buf, sizeof(buffered_send_buf));
    MPI_Bsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
    MPI_Buffer_detach(&buffer, &size);
    if(buffer != buffered_send_buf || size != sizeof(buffered_send_buf)) {
        printf ("[%d] Unexpected buffer returned by MPI_Buffer_detach(): %p/%d != %p/%d\n", rank, buffer, size, buffered_send_buf, (int)sizeof(buffered_send_buf));
        MPI_Abort(MPI_COMM_WORLD, 201);
    }
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
              MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv");
  }
  else {

    LOCDEF();

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv");
    init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);

  }

  VT_END_REGION( buffered_sends );


/*
 * Buffered persistent sends
 */

  VT_BEGIN_REGION( buffered_persistent_sends );

  if (rank == 0)
    printf ("Buffered Persistent Send\n");

  tag = 238;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);

    LOCDEF();

    MPI_Buffer_attach(buffered_send_buf, sizeof(buffered_send_buf));
    MPI_Bsend_init(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, requests);
    MPI_Start(requests);
    MPI_Wait(requests, statuses);
    MPI_Request_free(requests);
    MPI_Buffer_detach(&buffer, &size);
    if(buffer != buffered_send_buf || size != sizeof(buffered_send_buf)) {
        printf ("[%d] Unexpected buffer returned by MPI_Buffer_detach(): %p/%d != %p/%d\n", rank, buffer, size, buffered_send_buf, (int)sizeof(buffered_send_buf));
        MPI_Abort(MPI_COMM_WORLD, 201);
    }
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
              MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv");
  }
  else {

    LOCDEF();

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv");
    init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);

  }

  VT_END_REGION( buffered_persistent_sends );


/*
 * Ready sends.  Note that we must ensure that the receive is posted
 * before the rsend; this requires using Irecv.
 */


  VT_BEGIN_REGION( ready_sends );

  if (rank == 0)
    printf ("Rsend\n");

  tag = 1456;
  count = TEST_SIZE / 3;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);
    MPI_Recv(MPI_BOTTOM, 0, MPI_INT, next, tag, MPI_COMM_WORLD, &status);
    MPI_Rsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
    MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status);
    if (status.MPI_SOURCE != prev)
      printf ("Incorrect src, expected %d, got %d\n",prev, status.MPI_SOURCE);

    if (status.MPI_TAG != tag)
      printf ("Incorrect tag, expected %d, got %d\n",tag, status.MPI_TAG);

    MPI_Get_count(&status, MPI_DOUBLE, &i);
    if (i != count)
      printf ("Incorrect count, expected %d, got %d\n",count,i);

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);

    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "rsend and recv");
  }
  else {
    MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
              MPI_COMM_WORLD, requests);
    MPI_Send( MPI_BOTTOM, 0, MPI_INT, next, tag, MPI_COMM_WORLD);
    MPI_Wait(requests, &status);

    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "rsend and recv");
    init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
  }

  VT_END_REGION( ready_sends );

/*
 * Synchronous sends
 */

  VT_BEGIN_REGION( sync_sends );

  if (rank == 0)
    printf ("Ssend\n");

  tag = 1789;
  count = TEST_SIZE / 3;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);
    MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &status);
    if (flag)
      printf ("Iprobe succeeded! source %d, tag %d\n",status.MPI_SOURCE,
                                                      status.MPI_TAG);

    MPI_Ssend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);

    while (!flag)
      MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &status);

    if (status.MPI_SOURCE != prev)
      printf ("Incorrect src, expected %d, got %d\n",prev, status.MPI_SOURCE);

    if (status.MPI_TAG != tag)
      printf ("Incorrect tag, expected %d, got %d\n",tag, status.MPI_TAG);

    MPI_Get_count(&status, MPI_DOUBLE, &i);

    if (i != count)
      printf ("Incorrect count, expected %d, got %d\n",count,i);

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "ssend and recv");
  }
  else {
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "ssend and recv"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Ssend(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
  }

  VT_END_REGION( sync_sends );

/*
 * Nonblocking normal sends
 */

  VT_BEGIN_REGION( nblock_sends );

  if (rank == 0)
    printf ("Isend\n");

  tag = 2123;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
    MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
              MPI_COMM_WORLD, requests);
    init_test_data(send_buf,TEST_SIZE,0);
    MPI_Isend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD,
              (requests+1));
    MPI_Waitall(2, requests, statuses);
    rq_check( requests, 2, "isend and irecv" );

    msg_check(recv_buf,prev,tag,count,statuses, TEST_SIZE,"isend and irecv");
  }
  else {
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check(recv_buf,prev,tag,count,&status, TEST_SIZE,"isend and irecv"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Isend(recv_buf, count, MPI_DOUBLE, next, tag,MPI_COMM_WORLD,
              (requests));
    MPI_Wait((requests), &status);
    rq_check(requests, 1, "isend (and recv)");
  }



  VT_END_REGION( nblock_sends );

/*
 * Nonblocking ready sends
 */


  VT_BEGIN_REGION( nblock_rsends );

  if (rank == 0)
    printf ("Irsend\n");

  tag = 2456;
  count = TEST_SIZE / 3;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
    MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
              MPI_COMM_WORLD, requests);
    init_test_data(send_buf,TEST_SIZE,0);
    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, next, 0,
                  MPI_BOTTOM, 0, MPI_INT, next, 0,
                  dupcom, &status);
    MPI_Irsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD,
               (requests+1));
    reqcount = 0;
    while (reqcount != 2) {
      MPI_Waitany( 2, requests, &index, statuses);
      if( index == 0 ) {
	  memcpy( &status, statuses, sizeof(status) );
      }
      reqcount++;
    }

    rq_check( requests, 1, "irsend and irecv");
    msg_check(recv_buf,prev,tag,count,&status, TEST_SIZE,"irsend and irecv");
  }
  else {
    MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
              MPI_COMM_WORLD, requests);
    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, next, 0,
                  MPI_BOTTOM, 0, MPI_INT, next, 0,
                  dupcom, &status);
    flag = 0;
    while (!flag)
      MPI_Test(requests, &flag, &status);

    rq_check( requests, 1, "irsend and irecv (test)");
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "irsend and irecv"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Irsend(recv_buf, count, MPI_DOUBLE, next, tag,
               MPI_COMM_WORLD, requests);
    MPI_Waitall(1, requests, statuses);
    rq_check( requests, 1, "irsend and irecv");
  }

  VT_END_REGION( nblock_rsends );

/*
 * Nonblocking synchronous sends
 */

  VT_BEGIN_REGION( nblock_ssends );

  if (rank == 0)
    printf ("Issend\n");

  tag = 2789;
  count = TEST_SIZE / 3;
  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
    MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
              MPI_COMM_WORLD, requests );
    init_test_data(send_buf,TEST_SIZE,0);
    MPI_Issend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD,
               (requests+1));
    flag = 0;
    while (!flag)
      MPI_Testall(2, requests, &flag, statuses);

    rq_check( requests, 2, "issend and irecv (testall)");
    msg_check( recv_buf, prev, tag, count, statuses, TEST_SIZE, 
               "issend and recv");
  }
  else {
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "issend and recv"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Issend(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD,requests);

    flag = 0;
    while (!flag)
      MPI_Testany(1, requests, &index, &flag, statuses);

    rq_check( requests, 1, "issend and recv (testany)");
  }


  VT_END_REGION( nblock_ssends );


/*
 * Persistent normal sends
 */

  VT_BEGIN_REGION( pers_sends );

  if (rank == 0)
    printf ("Send_init\n");

  tag = 3123;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  MPI_Send_init(send_buf, count, MPI_DOUBLE, next, tag,
                MPI_COMM_WORLD, requests);
  MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
                MPI_COMM_WORLD, (requests+1));

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);
    MPI_Startall(2, requests);
    MPI_Waitall(2, requests, statuses);
    msg_check( recv_buf, prev, tag, count, (statuses+1),
               TEST_SIZE, "persistent send/recv");
  }
  else {
    MPI_Start((requests+1));
    MPI_Wait((requests+1), &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "persistent send/recv");
    init_test_data(send_buf,TEST_SIZE,1);


    MPI_Start(requests);
    MPI_Wait(requests, &status);
  }
  MPI_Request_free(requests);
  MPI_Request_free((requests+1));


  VT_END_REGION( pers_sends );

/*
 * Persistent ready sends
 */

  VT_BEGIN_REGION( pers_rsends );

  if (rank == 0)
    printf ("Rsend_init\n");

  tag = 3456;
  count = TEST_SIZE / 3;

  clear_test_data(recv_buf,TEST_SIZE);

  MPI_Rsend_init(send_buf, count, MPI_DOUBLE, next, tag,
                  MPI_COMM_WORLD, requests);
  MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, MPI_COMM_WORLD, (requests+1));

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0); MPI_Barrier( MPI_COMM_WORLD );
    MPI_Startall(2, requests);
    reqcount = 0;
    while (reqcount != 2) {
      MPI_Waitsome(2, requests, &outcount, indices, statuses);
      for (i=0; i<outcount; i++) {
        if (indices[i] == 1) {
          msg_check( recv_buf, prev, tag, count, (statuses+i),
                     TEST_SIZE, "waitsome");
        }
	reqcount++;
      }
    }
  }
  else {
    MPI_Start((requests+1)); MPI_Barrier( MPI_COMM_WORLD );
    flag = 0;
    while (!flag)
      MPI_Test((requests+1), &flag, &status);

    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "test");

    init_test_data(send_buf,TEST_SIZE,1);

 
    MPI_Start(requests);
    MPI_Wait(requests, &status);
  }
  MPI_Request_free(requests);
  MPI_Request_free((requests+1));


  VT_END_REGION( pers_rsends );


/*
 * Persistent synchronous sends
 */


  VT_BEGIN_REGION( pers_ssends );

  if (rank == 0)
    printf ("Ssend_init\n");

  tag = 3789;
  count = TEST_SIZE / 3;

  clear_test_data(recv_buf,TEST_SIZE);

  MPI_Ssend_init(send_buf, count, MPI_DOUBLE, next, tag,
                 MPI_COMM_WORLD, (requests+1));
  MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
                 MPI_ANY_TAG, MPI_COMM_WORLD, requests);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);
    MPI_Startall(2, requests);

    reqcount = 0;
    while (reqcount != 2) {
      MPI_Testsome(2, requests, &outcount, indices, statuses);
      for (i=0; i<outcount; i++) {
        if (indices[i] == 0) {
          msg_check( recv_buf, prev, tag, count, (statuses+i),
                     TEST_SIZE, "testsome");
        }
	reqcount++;
      }
    }
  }
  else {
    MPI_Start(requests);
    flag = 0;
    while (!flag)
      MPI_Testany(1, requests, &index, &flag, statuses);

    msg_check( recv_buf, prev, tag, count, statuses, TEST_SIZE, "testany" );

    init_test_data(send_buf,TEST_SIZE,1);


     MPI_Start((requests+1));
     MPI_Wait((requests+1), &status);
  }
  MPI_Request_free(requests);
  MPI_Request_free((requests+1));


  VT_END_REGION( pers_ssends );


/*
 * Send/receive.
 */


  VT_BEGIN_REGION( sendrecv );

  if (rank == 0)
    printf ("Sendrecv\n");

  tag = 4123;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);
    MPI_Sendrecv(send_buf, count, MPI_DOUBLE, next, tag,
                 recv_buf, count, MPI_DOUBLE, prev, tag,
                 MPI_COMM_WORLD, &status );

    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "sendrecv");
  }
  else {
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
             MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "recv/send"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
  }


  VT_END_REGION( sendrecv );

#ifdef V_T
  VT_flush();
#endif


/*
 * Send/receive replace.
 */

  VT_BEGIN_REGION( sendrecv_repl );

  if (rank == 0)
    printf ("Sendrecv_replace\n");

  tag = 4456;
  count = TEST_SIZE / 3;

  if (rank == 0) {
      init_test_data(recv_buf, TEST_SIZE,0);
    for (i=count; i< TEST_SIZE; i++)
      recv_buf[i] = 0.0;

    MPI_Sendrecv_replace(recv_buf, count, MPI_DOUBLE,
                         next, tag, prev, tag, MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "sendrecvreplace");
  }
  else {
    clear_test_data(recv_buf,TEST_SIZE);
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
             MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,
               "recv/send for replace"); init_test_data(recv_buf,TEST_SIZE,1);
    MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD);
  }

  VT_END_REGION( sendrecv_repl );


/*
 * Send/Receive via inter-communicator
 */

  VT_BEGIN_REGION( intercomm );

  MPI_Intercomm_create(MPI_COMM_SELF, 0, MPI_COMM_WORLD, next, 1, &intercom);

  if (rank == 0)
    printf ("Send via inter-communicator\n");

  tag = 4018;
  count = TEST_SIZE / 5;

  clear_test_data(recv_buf,TEST_SIZE);

  if (rank == 0) {
      init_test_data(send_buf,TEST_SIZE,0);

    LOCDEF();

    MPI_Send(send_buf, count, MPI_DOUBLE, 0, tag, intercom);
    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE,
              MPI_ANY_TAG, intercom, &status);
    msg_check(recv_buf, 0, tag, count, &status, TEST_SIZE, "send and recv via inter-communicator");
  }
  else if (rank == 1) {

    LOCDEF();

    MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG,
             intercom, &status);
    msg_check( recv_buf, 0, tag, count, &status, TEST_SIZE,"send and recv via inter-communicator");
    init_test_data(recv_buf,TEST_SIZE,0);
    MPI_Send(recv_buf, count, MPI_DOUBLE, 0, tag, intercom);

  }

  VT_END_REGION( intercomm );



  MPI_Comm_free(&intercom);
  MPI_Comm_free(&dupcom);
} 
Example #29
0
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status; 
	
	double reading_time, dimensions_time, scatter_time, gather_time, writing_time;
	

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);             
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid 
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;    
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates);
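	// coordinates[0] is this process's row and coordinates[1] its column in the sqrt_size x sqrt_size grid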

	// create a row communicator
	remain_dims[0] = 0;            
	remain_dims[1] = 1; 
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);

	//Start I/O reading counter!
	reading_time = MPI_Wtime();

	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions 
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", 
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0 
				|| matrices_a_b_dimensions[1] % sqrt_size != 0 
				|| matrices_a_b_dimensions[2] % sqrt_size != 0 
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}
	
	//stop I/O reading counter
	reading_time = MPI_Wtime() - reading_time;
	
	//Start dimensions sending counter
	dimensions_time = MPI_Wtime();

	// send dimensions to all peers
	if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator);
		}
	} else {
		MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status);
	}
	
	//stop dimensions sending counter
	dimensions_time = MPI_Wtime() - dimensions_time;

	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}
	
	//Start data scattering counter
	scatter_time = MPI_Wtime();

	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] 
							= A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] 
							= B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	} 

	// send a block to each process
	if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);
			MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);
		}
		for(i = 0; i < A_local_block_size; i++){
			A_local_block[i] = A_array[i];
		}
		for(i = 0; i < B_local_block_size; i++){
			B_local_block[i] = B_array[i];
		}
	} else {
		MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
		MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
	}

	// fix initial arrangements before the core algorithm starts
	if(coordinates[0] != 0){
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, 
				(coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0, 
				(coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
	}
	if(coordinates[1] != 0){
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, 
				(coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0, 
				(coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
	}
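	// after this initial skew, the process at grid position (i,j) holds block A(i, (j+i) mod sqrt_size)
	// and block B((i+j) mod sqrt_size, j), which is exactly the alignment Cannon's algorithm requires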
	
	//stop data scattering counter
	scatter_time = MPI_Wtime() - scatter_time;

	// cannon's algorithm
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;
	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
						B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		start = MPI_Wtime();
		// rotate blocks horizontally
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, 
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0, 
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, 
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0, 
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start;
	}
	
	//Start data gathering counter
	gather_time = MPI_Wtime();
	
	// get C parts from other processes at rank 0
	if(rank == 0) {
		for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){
			C_array[i] = C_local_block[i];
		}
		int i;
		for(i = 1; i < size; i++){
			MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns), A_local_block_rows * B_local_block_columns, 
				MPI_DOUBLE, i, 0, cartesian_grid_communicator, &status);
		}
	} else {
		MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0, cartesian_grid_communicator);
	}
	
	//stop data gathering counter
	gather_time = MPI_Wtime() - gather_time;
	
	

	// generating output at rank 0
	if (rank == 0) {
		
		//Start I/O writing counter
		writing_time = MPI_Wtime();
		
		// convert the ID array into the actual C matrix 
		int i, j, k, row, column;
		
		char output_filename[50];
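		// NOTE: argv[3] is read unconditionally here, so this variant assumes the extra command-line argument is always supplied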
		sprintf(output_filename,"output%dx%d_%d.out",A_rows,B_columns,atoi(argv[3]));
		FILE *fp;
		if((fp = fopen(output_filename, "wb")) == NULL) { perror("File cannot be opened"); exit(1); }

		fwrite(C_array, sizeof(double), A_rows*B_columns, fp);

		fclose(fp);
		//stop I/O writing counter
		writing_time = MPI_Wtime() - writing_time;
		
		
		//Print metrics
		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);
		printf("Reading time:     %lf\n", reading_time);
		printf("Dimensions time:  %lf\n", dimensions_time);
		printf("Scattering time:  %lf\n", scatter_time);
		printf("Gathering time:   %lf\n", gather_time);
		printf("Writing time:     %lf\n", writing_time);
		
		printf("Total non-computational MPI time: %lf\n", dimensions_time + scatter_time + gather_time);
		printf("Total IO time:    %lf\n", reading_time + writing_time);

		if (argc == 5){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					printf("%7.3f ", temp);
					if(temp != C[i][j]){
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}	
	}



	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();
}
Example #30
0
int main(int argc, char **argv) {

    int rank, num_tasks;

    /* Initialize MPI */
#if USE_MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    // printf("Hello world from rank %3d of %3d\n", rank, num_tasks);
#else
    rank = 0;
    num_tasks = 1;
#endif

    if (argc != 2) {
        if (rank == 0) {
            fprintf(stderr, "%s <n>\n", argv[0]);
            fprintf(stderr, "Program for parallel dense matrix-matrix multiplication\n");
            fprintf(stderr, "with 1D row partitioning\n");
            fprintf(stderr, "<n>: matrix dimension (an nxn dense matrix is created)\n");
        }
        /* abort on every rank, not just rank 0, so no rank falls through and reads the missing argument */
#if USE_MPI
        MPI_Abort(MPI_COMM_WORLD, 1);
#else
        exit(1);
#endif
    }

    int n;

    n = atoi(argv[1]);
    assert(n > 0);
    assert(n < 10000);

    /* ensure that n is a multiple of num_tasks */
    n = (n/num_tasks) * num_tasks;
    
    int n_p = (n/num_tasks);

    /* print new n to let user know n has been modified */
    if (rank == 0) {
        fprintf(stderr, "n: %d, n_p: %d, num_tasks: %d\n", n, n_p, num_tasks);
        fprintf(stderr, "Requires %3.6lf MB of memory per task\n", ((3*4.0*n_p)*n/1e6));
    }

    float *A, *B, *C;
    
    A = (float *) malloc(n_p * n * sizeof(float));
    assert(A != 0);

    B = (float *) malloc(n_p * n * sizeof(float));
    assert(B != 0);
    
    C = (float *) malloc(n_p * n * sizeof(float));
    assert(C != 0);

    /* linearized matrices in row-major storage */
    /* A[i][j] would be A[i*n+j] */

    int i, j;

    /* static initialization, so that we can verify output */
    /* using very simple initialization right now */
    /* this isn't a good check for parallel debugging */
#ifdef _OPENMP
#pragma omp parallel for private(i,j)
#endif
    for (i=0; i<n_p; i++) {
        for (j=0; j<n; j++) {
            A[i*n+j] = (rank+1);
            B[i*n+j] = 1;
            C[i*n+j] = 0;
        }
    }

#if USE_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

    double elt = 0.0, commStart = 0.0, commEnd = 0.0, commTime = 0.0, totalCommTime = 0.0, tempTime;
    if (rank == 0) 
        elt = timer();

#if USE_MPI
    /* Parallel matmul code goes here, see lecture slides for idea */
    /* The matrix C should be updated correctly */

    // precalculate some variables to use in the loop
    int dest=rank+1, src=rank-1, numElemPerProc = n_p * n;

    // mark the beginning of column indices
    int colStart = rank * n_p;
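    // colStart is the first column of A that lines up with the block of B rows currently
    // held by this rank; it moves back by n_p after every rotation of B around the ring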
    int nr, k;
    for(nr = 0; nr < num_tasks; nr++)
    {
        // wrap around colStart
        if(colStart < 0)
            colStart = n - n_p;

        // do the actual matrix multiplication on each proc's data
        for(i = 0; i < n_p; i++)
        {
            int iC = i * n;
            int iA = iC + colStart;
            for(j = 0; j < n; j++)
            {
                float result = 0;
                for(k = 0; k < n_p; k++)
                    result += A[iA + k] * B[k*n + j];

                // attempt at loop unrolling with re-association
                //for(k = 0; k < n_p; k+=2)
                //    result += (A[iA + k] * B[k*n + j]) + (A[iA + k + 1] * B[(k+1)*n + j]);

                //// include the remaining elements now
                //for(; k < n_p; k++)
                //    result += (A[iA + k] * B[k*n + j]);

                C[iC + j] += result;
            } // end of j loop
        } // end of i loop

        // wrap around indices in case of edge conditions
        if(rank == 0)
            src = num_tasks - 1;
        else if (rank == num_tasks - 1)
            dest = 0;

        commStart = timer();
        // use SendRecv replace to perform a Send first to higher rank proc
        // and then Receive from a lower rank proc
        // Cyclic transfers of chunks of B's data
        MPI_Sendrecv_replace(B, numElemPerProc, MPI_FLOAT,
                             dest,  123,
                             src,   123,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        commEnd = timer();
        commTime = commEnd - commStart;

        // take the maximum communication time across ranks, since the slowest rank limits overall progress
        MPI_Reduce(&commTime, &tempTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

        // only the root process accumulates the total communication time
        if(!rank)
            totalCommTime += tempTime;

        // adjust colStart since we have moved on to the
        // previous row, so colStart needs to be decremented
        // and wrapped around
        colStart -= n_p;
    } // end of nr for loop

#else
    int k;
#ifdef _OPENMP
#pragma omp parallel for private(i,j,k)
#endif
    for (i=0; i<n_p; i++) {
        for (j=0; j<n; j++) {
            float c_ij = 0;
            for (k=0; k<n; k++) {
                c_ij += A[i*n+k]*B[k*n+j];
            }
            C[i*n+j] = c_ij;
        }
    }
#endif

    if (rank == 0) 
        elt = timer() - elt;

    /* Verify */
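    /* with A[i][j] = rank+1 and B[i][j] = 1 everywhere, every entry of C should equal (rank+1)*n */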
    int verify_failed = 0;
    for (i=0; i<n_p; i++) {
        for (j=0; j<n; j++) {
            if (C[i*n+j] != ((rank+1)*n))
                verify_failed = 1;
        }
    }

    if (verify_failed) {
        fprintf(stderr, "ERROR: rank %d, verification failed, exiting!\n", rank);
#if USE_MPI
        MPI_Abort(MPI_COMM_WORLD, 2);
#else
        exit(2);
#endif
    }

    if (rank == 0) {
        fprintf(stderr, "Time taken: %3.3lf s.\n", elt);
        fprintf(stderr, "Performance: %3.3lf GFlop/s\n", (2.0*n*n)*n/(elt*1e9));

        fprintf(stderr, "Communication time: %3.3lf s.\n", totalCommTime);
        fprintf(stderr, "Computation time must be: %3.3lf s.\n", elt - totalCommTime);
    }

    /* free memory */
    free(A); free(B); free(C);

    /* Shut down MPI */
#if USE_MPI
    MPI_Finalize();
#endif

    return 0;
}