unsigned int gol(unsigned char *grid, unsigned int dim_x, unsigned int dim_y, unsigned int time_steps)
{
	// READ ME! Parallelize this function to work with MPI. It must work even with a single process.
	// We expect you to use MPI_Scatterv, MPI_Gatherv, and MPI_Sendrecv to achieve this.
	// The total number of MPI_Scatterv/MPI_Gatherv calls is checked to equal np (each rank calls them once),
	// and the total number of MPI_Sendrecv calls is expected to equal 2 * np * time_steps:
	// top + bottom ghost rows, exchanged by every rank, in every time step.

	int np, rank, quo, rem;
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Allocate length and displacements array
    quo = (dim_y/np)*dim_x;
    rem = (dim_y%np)*dim_x;
    int *length = (int *) calloc(np, sizeof(int));
    int *disps = (int *) calloc(np, sizeof(int));

    // Fill corresponding arrays
    for(int i = 0; i < np - 1; i++)
    {
        disps[i + 1] = disps[i] + quo;
        length[i] = disps[i + 1] - disps[i];
    }
    length[np - 1] = quo + rem;

    // Grids allocation
	unsigned char *loc_grid_in, *loc_grid_tmp, *loc_grid_out;
    loc_grid_in = (unsigned char *) calloc(length[rank] + 2*dim_x, sizeof(unsigned char));
    loc_grid_tmp = (unsigned char *) calloc(length[rank] + 2*dim_x, sizeof(unsigned char));
	if (loc_grid_in == NULL || loc_grid_tmp == NULL)
		exit(EXIT_FAILURE);

	// Distribute parts of grid to other processors
    MPI_Scatterv(grid, length, disps, MPI_UNSIGNED_CHAR,
    			 loc_grid_in + dim_x, length[rank], MPI_UNSIGNED_CHAR,
    			 0,
    			 MPI_COMM_WORLD);

	loc_grid_out = loc_grid_tmp;
	int loc_dim_y = length[rank]/dim_x;
	int frw = (rank + 1 + np) % np;
	int backw = (rank - 1 + np) % np;

	for (int t = 0; t < time_steps; ++t)
	{
		// Forward sendrecv
		MPI_Sendrecv(loc_grid_in + length[rank], dim_x, MPI_UNSIGNED_CHAR, frw, 1,
					 loc_grid_in, dim_x, MPI_UNSIGNED_CHAR, backw, 1,
    				 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

		// Backward sendrecv
		MPI_Sendrecv(loc_grid_in + dim_x, dim_x, MPI_UNSIGNED_CHAR, backw, 0,
					 loc_grid_in + dim_x + length[rank], dim_x, MPI_UNSIGNED_CHAR, frw, 0,
    				 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		
		for (int y = 1; y < 1 + loc_dim_y; ++y)
		{
			for (int x = 0; x < dim_x; ++x)
			{
				evolve(loc_grid_in, loc_grid_out, dim_x, loc_dim_y + 2, x, y);
			}
		}
		swap((void**)&loc_grid_in, (void**)&loc_grid_out);
	}

	MPI_Gatherv(loc_grid_in + dim_x, length[rank], MPI_UNSIGNED_CHAR,
   				grid, length, disps, MPI_UNSIGNED_CHAR,
   				0,
   				MPI_COMM_WORLD);

	free(loc_grid_in);
	free(loc_grid_out);
	free(disps);
	free(length);
	
	if (rank == 0)
		return cells_alive(grid, dim_x, dim_y);
	else
		return 0;
}
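
The kernel above leaves evolve(), cells_alive(), and swap() to be defined elsewhere. A minimal sketch of the pointer-swap helper, assuming the swap(void**, void**) signature implied by the call at the end of each time step:

/* Hypothetical helper: exchange the in/out grid pointers between time steps. */
static void swap(void **a, void **b)
{
	void *tmp = *a;
	*a = *b;
	*b = tmp;
}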
Example #2
int main()
{
	printf("Program start\n");
	
	FILE *fp;
	clock_t begin,end;
	double time_spent;
	int NPROC, rank, root, colindex, link,i,j=0, k, col,colmatch=0,localsum=0;	
        int newlines = 0, linenum = 5;
	int ch; // must be int so the EOF comparison below is reliable
	char line[21];
	double * val = (double*)calloc(EDGES, sizeof(double));
//        double val[EDGES];
	int * rowind =(int*)calloc(EDGES, sizeof(int));
//	int rowind[EDGES];
	int *sendcnts;
	int *displs;
	int * colptr = (int*)calloc(NODES+1, sizeof(int)); // calloc zero-initializes; holds no. of cols + 1 entries
	int co, index; //for normalizing the array of non zero elements
	
	double * pr = (double*)malloc(NODES*sizeof(double)); //malloc and initialize to 0.25
	double * prnew = (double*)calloc(NODES, sizeof(double));
	double * damp1 = (double*)malloc(NODES*sizeof(double)); //malloc and initialize to 0.85
	double * damp2 = (double*)malloc(NODES*sizeof(double)); //malloc and initialize to 0.15/NODES
	double * diff = (double*)calloc(NODES, sizeof(double));
        double * sum = (double*)calloc(NODES, sizeof(double)); //this is for the column vectors
	double * rec_val = (double*)malloc(100000*sizeof(double));
        double * rec_pr = (double*)malloc(100000*sizeof(double));
//        double rec_val[4000], rec_pr[4000]; //these receive the scattered vector parts in the processes
	double err = 0.00001;
	double norm, norm_sq;
	int * readsum = (int*)calloc(NODES, sizeof(int));
        int rec_col;
	int * rec_row = (int*)malloc(100000*sizeof(int));
        MPI_Init(NULL, NULL);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &NPROC);
	printf("Number of processes %d\n", NPROC);
        sendcnts= malloc(sizeof(int)*NPROC);
        displs = malloc(sizeof(int)*NPROC);
        int * pcols = (int*)malloc(NPROC*sizeof(int)); //No.of columns to each processor
//	int pcols[NPROC];
	int * displs_pr = (int*)malloc(NPROC*sizeof(int));// displacement for pagerank vector scatter

	for(i=0; i<NODES;i++) {
		pr[i] = 0.25;
		damp1[i] = 0.85;
		damp2[i] = 0.15/NODES;
	}
	printf("initialization complete\n");
	printf(" I am rank %d\n", rank);
//1. Populate val, rowind, colptr
//2. Calculate number of columns to each process
//3. Number of pagerank vector elements to each process
//4. Distribution of non zero elements to corresponding processes

        fp = fopen("data1.dat", "r");
	while((ch=getc(fp)) != EOF) {
	        if(ch == '\n') {
		        newlines += 1;
			if(newlines == linenum - 1) {
			        break;
			}
		}
	}	
	for(i = 0; i<EDGES; i++) {
		fscanf(fp, "%d %d", &colindex, &link);
//The next two lines convert 1-based node numbers to 0-based; comment them out if the input is already 0-based
         	colindex = colindex - 1;
		link = link - 1;
		rowind[i] = link;
		if(colmatch==colindex) {
			localsum += 1;
		}
		else {
			readsum[j] = localsum;
			colptr[j+1] = colptr[j] + localsum; //index of val where new column starts
			localsum = 1; //new localsum
			j += 1;
			colmatch = colindex; //new column
		}
		val[i] = 1.0;
	}
	readsum[j] = localsum; //for the last column
	colptr[j+1]= EDGES; //number of non zeros in the matrix
	fclose(fp);
	index = 0;
	for(i = 0; i<NODES; i++) { // This is to normalize the array of non zeros
		co = readsum[i];
		for(j = index; j < index+co; j++) {
			val[j] = val[j]/co;
		}
		index += co;
	}
	printf("val, rowind and colptr have been populated\n");
// val, rowind, colptr calculation complete... all the above should
// go to mpi file..

// Calculate number of columns to each process

        for(i=0; i<NPROC; i++) {
		if(i==0) {
			pcols[i] = NODES/NPROC + NODES%NPROC;
			displs_pr[i] = 0;
		}else {
			pcols[i] = NODES/NPROC;
			displs_pr[i] = pcols[i-1] + displs_pr[i-1];
		}
	}
// Calculating sendcnts and displs
        j = 0;
	for(i=0; i<NPROC; i++) {
		j = j + pcols[i];
		k = j - pcols[i];
		sendcnts[i] = colptr[j] - colptr[k];
		if (i==0) {
			displs[i] = 0;
		}else
			displs[i] = sendcnts[i-1] + displs[i-1];
	}
	if(rank==MASTER) {
	        printf("This is MASTER\n");
		
	        printf("\nsendcnts\n");
		for(i=0;i<NPROC;i++) {
		         printf("%d\t", sendcnts[i]);
	        }
	        printf("\ndispls\n");
	        for(i=0;i<NPROC;i++) {
		         printf("%d\t", displs[i]);
	        }	
	        printf("\nval\n");
//	        for(i=0;i<EDGES; i++) {
//	                 printf("%f\t", val[i]);
//	        }
	}
//	double val[] = {0.25,0.25,0.25,0.25,0.5,0.5,1.0,0.5,0.5};
//	int sendcnts[] = {6,3};
//	int displs[] = {0,6};
	MPI_Scatterv(val, sendcnts, displs, MPI_DOUBLE, rec_val, sendcnts[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);//non-zero elements
	printf("first scatterv completed\n");
	MPI_Scatter(pcols, 1, MPI_INT, &rec_col, 1, MPI_INT, 0, MPI_COMM_WORLD);//number of columns in each processor
	printf("scatter completed\n");
	MPI_Scatterv(rowind, sendcnts, displs, MPI_INT, rec_row, sendcnts[rank], MPI_INT, 0, MPI_COMM_WORLD);//rowindices 	
	printf("second scatterv completed\n");
	double *vec[rec_col];// Initializing vector columns
	for(i=0; i<rec_col; i++) {
		vec[i] = (double *)calloc(NODES,sizeof(double));
	}
	k=0;
	for(j=0;j<rec_col;j++) {// Splitting rec_val to columnvectors
		for(i=0;i<NODES;i++) {
			if(i==rec_row[k]) {
				vec[j][i] = rec_val[k];//these vectors don't change with iterations
				k+=1;
			}
		}
	}
	begin = clock();
	do //Only MASTER holds the updated prnew and pr; each process has its own sum
	{  // norm is calculated by MASTER but broadcast to all ranks so every process evaluates the same loop condition
		memset(sum, 0, NODES*sizeof(double));
		if(rank == MASTER) {
		        memset(prnew, 0, NODES*sizeof(double));
		}	
		norm = 0.0;
		//Scatter and multiply
                MPI_Scatterv(pr, pcols, displs_pr, MPI_DOUBLE, rec_pr, pcols[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);
		for(i=0;i<NODES;i++) {
			for(j=0;j<rec_col;j++) {
				sum[i] += vec[j][i]*rec_pr[j];// sum of column multiplications
			}
		}
		MPI_Reduce(sum, prnew, NODES, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);//vector sums into master
		// Normalizing
		if(rank == MASTER) {
		        for(i=0;i<NODES;i++) {
			        prnew[i]= prnew[i]*damp1[i] + damp2[i];
			}
			norm_sq = 0.0;
			for(i=0; i<NODES; i++) {// for norm calculation
			        diff[i] = prnew[i] - pr[i];
				norm_sq += diff[i]*diff[i];
				pr[i] = prnew[i];
			}
			norm = sqrt(norm_sq);
		}
		MPI_Bcast(&norm, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
		//if(rank==MASTER) {
		//printf("Reduced page rank vector:\n");
		//for(i=0;i<NODES;i++) {
		//	printf("%f\t", prnew[i]);
		//}
	}while(norm>err);
	end = clock();
	time_spent = (double)(end-begin)/CLOCKS_PER_SEC;
	if(rank==MASTER) {
	        printf("\npagerank vector first ten elements\n");
		for(i = 0; i<10; i++) {
		        printf("%f\t", prnew[i]);
		}
		printf("\n");
		printf("Time taken for power iteration solution %fseconds\n",time_spent);
	}
	MPI_Finalize();
	return 0;
}
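
The val/rowind/colptr arrays built above follow the usual compressed-sparse-column (CSC) layout. A minimal sketch, assuming that layout and the NODES macro used above, of how the nonzeros of one column are addressed:

/* Sketch: in CSC form, column j's nonzeros live in val[colptr[j] .. colptr[j+1]-1],
 * and rowind[k] gives the row index of val[k]. */
for (int j = 0; j < NODES; j++) {
    for (int k = colptr[j]; k < colptr[j + 1]; k++) {
        /* matrix entry (rowind[k], j) has value val[k] */
    }
}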
Example #3
int main (int argc, char ** argv) {
   int taskid, ntasks;

    int xsize, ysize, colmax;
    pixel src[MAX_PIXELS];
    double w[MAX_RAD];

    struct timespec stime, etime;
    struct timespec tstime, tetime;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Comm_size(MPI_COMM_WORLD, &ntasks);

    // Create a custom MPI datatype for pixel
    pixel item;
    MPI_Datatype pixel_mpi;
    MPI_Datatype type[3] = { MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR };
    int blocklen[] = { 1, 1, 1 };
    MPI_Aint start, disp[3];

    MPI_Address( &item, &start );
    MPI_Address( &item.r, &disp[0] );
    MPI_Address( &item.g, &disp[1] );
    MPI_Address( &item.b, &disp[2] );

    disp[0] -= start;
    disp[1] -= start;
    disp[2] -= start;

    MPI_Type_struct(3, blocklen, disp, type, &pixel_mpi);
    MPI_Type_commit(&pixel_mpi);

    int buffsize, radius, startY, endY;

    /* Take care of the arguments */
    if (argc != 4) {
        fprintf(stderr, "Usage: %s radius infile outfile\n", argv[0]);
        exit(1);
    }
    radius = atoi(argv[1]);
    if((radius > MAX_RAD) || (radius < 1)) {
        fprintf(stderr, "Radius (%d) must be greater than zero and less then %d\n", radius, MAX_RAD);
        exit(1);
    }

    if (taskid == ROOT) {
        /* read file */
        if(read_ppm (argv[2], &xsize, &ysize, &colmax, (char *) src) != 0)
            exit(1);

        if (colmax > 255) {
            fprintf(stderr, "Too large maximum color-component value\n");
            exit(1);
        }

        /* filter */
        printf("Has read the image, generating coefficients\n");
        get_gauss_weights(radius, w);
    }

    // Broadcast the gaussian weight vector
    MPI_Bcast(w, MAX_RAD, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);
    // Broadcast image dimensions
    MPI_Bcast(&xsize, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
    MPI_Bcast(&ysize, 1, MPI_INT, ROOT, MPI_COMM_WORLD);

    // Calculate chunk size
    buffsize = ceil((float)ysize / (float)ntasks) * xsize;
    pixel recvbuff[MAX_PIXELS];

    int sendcnts[ntasks], displs[ntasks], result_write_starts[ntasks], recievecounts[ntasks];
    int i;
    // Generate sendcount and displacement vectors for Scatterv
    for (i = 0; i < ntasks; i++) {
        // Send enough neighbouring rows to make it possible to also calculate
        // the blur at the edges of the chunk
        sendcnts[i] = buffsize + 2 * radius * xsize;
        displs[i] = max(0, i * buffsize);
    }

    clock_gettime(CLOCK_REALTIME, &tstime);

    // Send the image in chunks to all nodes
    MPI_Scatterv(src, sendcnts, displs,
                 pixel_mpi, recvbuff, buffsize + 2 * radius * xsize,
                 pixel_mpi, ROOT, MPI_COMM_WORLD);

    clock_gettime(CLOCK_REALTIME, &stime);

    // Run the filter on the received chunk
    blurfilter(xsize, (ysize / ntasks) + 2 * radius, recvbuff, radius, w, taskid);

    clock_gettime(CLOCK_REALTIME, &etime);
    printf("Filtering at %i took: %g secs\n", taskid, (etime.tv_sec  - stime.tv_sec) +
        1e-9*(etime.tv_nsec  - stime.tv_nsec));

    // Generate receivecount and displacement vectors for Gatherv
    for (i = 0; i < ntasks; i++) {
        result_write_starts[i] = i * buffsize + xsize * radius;
        // Only send the part of the chunk that is really useful data
        recievecounts[i] = buffsize;
    }

    // Start writing from the beginning of the buffer if root
    result_write_starts[0] = 0;

    // Since the root node has no overlap in the beginning, we need to
    // send a little bit more from that node than from the rest.
    recievecounts[0] = buffsize + xsize * radius;

    pixel* result_read_start;
    if(taskid==ROOT) {
        // Root-node has no duplicated data in the beginning
        result_read_start = recvbuff;
    } else {
        // Jump over the duplicated data in the beginning of each chunk
        result_read_start = recvbuff + xsize * radius;
    }

    MPI_Gatherv(result_read_start, recievecounts[taskid], pixel_mpi,
                src, recievecounts, result_write_starts,
                pixel_mpi, ROOT, MPI_COMM_WORLD);

    clock_gettime(CLOCK_REALTIME, &tetime);

    MPI_Finalize();


    /* write result */
    if (taskid == ROOT) {
        printf("Everything took: %g secs\n", (tetime.tv_sec  - tstime.tv_sec) +
           1e-9*(tetime.tv_nsec  - tstime.tv_nsec));


        printf("Writing output file\n");

        if(write_ppm (argv[3], xsize, ysize, (char *)src) != 0)
          exit(1);
    }

    return(0);
}
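
MPI_Address and MPI_Type_struct (and the MPI_UB marker used in Example #9) are deprecated and were removed in MPI-3. A minimal sketch of the same pixel datatype built with the current API, assuming a pixel struct with unsigned char r, g, b members as used above:

/* Sketch: MPI-3 replacements for MPI_Address / MPI_Type_struct. */
pixel item;
MPI_Datatype pixel_mpi;
MPI_Datatype types[3] = { MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR, MPI_UNSIGNED_CHAR };
int blocklen[3] = { 1, 1, 1 };
MPI_Aint base, disp[3];

MPI_Get_address(&item, &base);
MPI_Get_address(&item.r, &disp[0]);
MPI_Get_address(&item.g, &disp[1]);
MPI_Get_address(&item.b, &disp[2]);
for (int i = 0; i < 3; i++)
    disp[i] = MPI_Aint_diff(disp[i], base);

MPI_Type_create_struct(3, blocklen, disp, types, &pixel_mpi);
MPI_Type_commit(&pixel_mpi);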
Example #4
/* Gather or scatter the global base array between processes.
 * NB: this is a collective operation.
 *
 * @scatter If true we scatter else we gather
 * @global_ary Global base array
 */
static void comm_gather_scatter(int scatter, bh_base *global_ary)
{
    bh_error err;
    bh_base *local_ary = array_get_local(global_ary);
    bh_intp totalsize = global_ary->nelem;

    if(totalsize <= 0)
        return;

    //Find the local size for all processes
    int sendcnts[pgrid_worldsize], displs[pgrid_worldsize];
    {
        bh_intp s = totalsize / pgrid_worldsize;//local size for all but the last process
        s *= bh_type_size(global_ary->type);
        for(int i=0; i<pgrid_worldsize; ++i)
        {
            sendcnts[i] = s;
            displs[i] = s * i;
        }
        //The last process gets the rest
        sendcnts[pgrid_worldsize-1] += totalsize % pgrid_worldsize * bh_type_size(global_ary->type);
    }

    int e;
    if(scatter)
    {
        //The slave-processes may need to allocate memory
        if(sendcnts[pgrid_myrank] > 0 && local_ary->data == NULL)
        {
            if((err = bh_data_malloc(local_ary)) != BH_SUCCESS)
                EXCEPT_OUT_OF_MEMORY();
        }
        //The master-process MUST have allocated memory already
        assert(pgrid_myrank != 0 || global_ary->data != NULL);

        //Scatter from master to slaves
        e = MPI_Scatterv(global_ary->data, sendcnts, displs, MPI_BYTE,
                         local_ary->data, sendcnts[pgrid_myrank], MPI_BYTE,
                         0, MPI_COMM_WORLD);
    }
    else
    {
        //Let's make sure that the 'local_ary' is updated
        batch_schedule_inst_on_base(BH_SYNC, local_ary);
        batch_flush();

        //The master-process may need to allocate memory
        if(pgrid_myrank == 0 && global_ary->data == NULL)
        {
            if((err = bh_data_malloc(global_ary)) != BH_SUCCESS)
                EXCEPT_OUT_OF_MEMORY();
        }

        //We will always allocate the local array when gathering because
        //only the last process knows if the array has been initiated.
        if((err = bh_data_malloc(local_ary)) != BH_SUCCESS)
            EXCEPT_OUT_OF_MEMORY();

        assert(sendcnts[pgrid_myrank] == 0 || local_ary->data != NULL);

        //Gather from the slaves to the master
        e = MPI_Gatherv(local_ary->data, sendcnts[pgrid_myrank], MPI_BYTE,
                        global_ary->data, sendcnts, displs, MPI_BYTE,
                        0, MPI_COMM_WORLD);
    }
    if(e != MPI_SUCCESS)
        EXCEPT_MPI(e);
}
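
For illustration of the partitioning above: with totalsize = 10 elements of 8 bytes each and pgrid_worldsize = 4, s = (10/4)*8 = 16, so sendcnts starts as {16, 16, 16, 16} with displs = {0, 16, 32, 48}; the last entry is then bumped by (10 % 4) * 8 = 16 bytes to 32, so all 80 bytes of the global array are covered.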
Example #5
int main(int argc, char *argv[]){
    int my_rank, procs, tag=0;
    uint64_t nodes = pow(2,SCALE);
    uint64_t edges = nodes*EDGEFACTOR;
    uint64_t root = ROOT;
        
    MPI_Status status;

    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size (MPI_COMM_WORLD, &procs); //SHOULD BE POWER OF TWO

    uint64_t *startVertex = NULL;
    uint64_t *endVertex = NULL;
    /* MUST BE INT BECAUSE OF MPI RESTRICTION */
    int *edgelist_send_counts = NULL;
    int *edgelist_send_displs = NULL;
    uint64_t *startVertex_recvbuf = NULL;
    uint64_t *endVertex_recvbuf = NULL;
    uint64_t *index_of_node = NULL;
    uint64_t *level = (uint64_t *) calloc(nodes / BITS, sizeof(uint64_t));
    int edgelist_counts_recvbuf = 0;
    if (my_rank == 0){
        startVertex = (uint64_t *) calloc(edges, I64_BYTES);
        endVertex = (uint64_t *) calloc(edges, I64_BYTES);
        edgelist_send_counts = (int *) calloc(procs, sizeof(int));
        edgelist_send_displs = (int *) calloc(procs, sizeof(int));

        read_graph(SCALE, EDGEFACTOR, startVertex, endVertex);
        
        double time = mytime();
        //SORTING THE EDGE LIST
        sort(startVertex, endVertex, 0, edges-1);
        
        //FINDING OUT THE BOUNDS OF THE EDGE LIST FOR EACH PROC
        int j;
        int last_node_number = 0;
        int core_count = 0;
        for (j = 0; j < procs; j++){
            last_node_number = nodes / procs * (j+1) - 1;
            core_count = (edges / procs * (j+1)) - 1;
            if (j < procs -1){
                while (startVertex[core_count] <= last_node_number) {
                    core_count++;
                }
                while (startVertex[core_count] > last_node_number){
                    core_count--;
                }
                if (j){
                    edgelist_send_counts[j] = core_count - edgelist_send_displs[j] + 1;
                }else{
                    edgelist_send_counts[j] = core_count + 1;
                }
                edgelist_send_displs[j+1] = core_count + 1;
            }else{
                edgelist_send_displs[0] = 0;
                edgelist_send_counts[j] = edges - edgelist_send_displs[j];
            }
        }
        
        MPI_Scatter((void *) edgelist_send_counts, 1, MPI_INT, &edgelist_counts_recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD);
        
        startVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES);
        endVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES);
        
        MPI_Scatterv((void *) startVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) startVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        MPI_Scatterv((void *) endVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) endVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        index_of_node = create_buffer_from_edgelist(startVertex_recvbuf, endVertex_recvbuf, nodes / procs, edgelist_counts_recvbuf, my_rank);
	
        //SET ROOT LEVEL
        level[(ROOT/BITS)] = level[(ROOT/BITS)] | (uint64_t) pow(2,(ROOT % BITS));

        //SCATTER LEVEL BUFFER
        MPI_Bcast((void *)level, nodes / BITS, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        /*for (i = 0; i < index_of_node[(nodes/procs)]; i++){
            printf("%llu = %llu\n", (unsigned long long) buffer_recvbuf[i], (unsigned long long) startVertex_recvbuf[i]);
        }
        
        for (i = 0; i < nodes / procs; i++){
            printf("%llu = %llu\n", (unsigned long long) count_edges_per_node_recvbuf[i], (unsigned long long) index_of_node[i]);
        }*/
        
        //BFS
        time = mytime() - time;
        printf("Time for reading, generating edge buffer and scattering: %f\n", time/1000000);
        time = mytime();
        bfs(level, startVertex_recvbuf, index_of_node[nodes/procs], index_of_node, my_rank, procs);

        time = mytime() - time;
        printf("Time for bfs searching: %f\n", time/1000000);

        free(edgelist_send_counts);
        free(edgelist_send_displs);
        free(startVertex);
        free(endVertex);
    }else{
        MPI_Scatter((void *) edgelist_send_counts, 1, MPI_INT, &edgelist_counts_recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD);
        
        startVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES);
        endVertex_recvbuf = (uint64_t *) calloc(edgelist_counts_recvbuf, I64_BYTES);
        
        MPI_Scatterv((void *) startVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) startVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        MPI_Scatterv((void *) endVertex, edgelist_send_counts, edgelist_send_displs, MPI_UINT64_T, (void *) endVertex_recvbuf, edgelist_counts_recvbuf, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        index_of_node = create_buffer_from_edgelist(startVertex_recvbuf, endVertex_recvbuf, nodes / procs, edgelist_counts_recvbuf, my_rank);
        
        // GET THE FIRST LEVEL
        MPI_Bcast((void *)level, nodes / BITS, MPI_UINT64_T, 0, MPI_COMM_WORLD);
        
        bfs(level, startVertex_recvbuf, index_of_node[nodes/procs], index_of_node, my_rank, procs);
    }
    free(level);
    free(startVertex_recvbuf);
    free(endVertex_recvbuf);
    free(index_of_node);
   
    MPI_Finalize ();
    return 0;
}
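
The level array above is used as a bitmap with BITS bits per uint64_t word, and the root bit is set via pow(), which goes through floating point. A minimal integer-only sketch of the same set/test operations, assuming that bitmap layout:

/* Sketch, assuming the level[] bitmap layout used above (BITS bits per word). */
static inline void set_visited(uint64_t *level, uint64_t node) {
    level[node / BITS] |= (uint64_t)1 << (node % BITS);
}

static inline int is_visited(const uint64_t *level, uint64_t node) {
    return (int)((level[node / BITS] >> (node % BITS)) & 1);
}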
Example #6
void IMB_scatterv(struct comm_info* c_info, int size, struct iter_schedule* ITERATIONS,
                 MODES RUN_MODE, double* time)
/*

                      
                      MPI-1 benchmark kernel
                      Benchmarks MPI_Scatterv
                      


Input variables: 

-c_info               (type struct comm_info*)                      
                      Collection of all base data for MPI;
                      see [1] for more information
                      

-size                 (type int)                      
                      Basic message size in bytes

-ITERATIONS           (type struct iter_schedule *)
                      Repetition scheduling

-RUN_MODE             (type MODES)                      
                      (only MPI-2 case: see [1])


Output variables: 

-time                 (type double*)                      
                      Timing result per sample


*/
{
  double t1, t2;
  int    i;
  Type_Size s_size,r_size;
  int s_num, r_num;

#ifdef CHECK
defect=0.;
#endif
  ierr = 0;

  /*  GET SIZE OF DATA TYPE */  
  MPI_Type_size(c_info->s_data_type,&s_size);
  MPI_Type_size(c_info->r_data_type,&r_size);
  if ((s_size!=0) && (r_size!=0))
    {
      s_num=size/s_size;
      r_num=size/r_size;
    } 

  /* INITIALIZATION OF DISPLACEMENT and RECEIVE COUNTS */

  for (i=0;i<c_info->num_procs ;i++)
    {
      c_info->sdispl[i] = s_num*i;
      c_info->sndcnt[i] = s_num;
    }

  
  if(c_info->rank!=-1)
    {
      for(i=0; i<N_BARR; i++) MPI_Barrier(c_info->communicator);

      t1 = MPI_Wtime();
      for(i=0;i<ITERATIONS->n_sample;i++)
      {
          ierr = MPI_Scatterv((char*)c_info->s_buffer+i%ITERATIONS->s_cache_iter*ITERATIONS->s_offs,
                              c_info->sndcnt,c_info->sdispl, c_info->s_data_type,
		              (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs,
// root = round robin
                              r_num, c_info->r_data_type, i%c_info->num_procs,
                              c_info->communicator);
          MPI_ERRHAND(ierr);
          CHK_DIFF("Scatterv",c_info, 
                   (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs,
                   c_info->sdispl[c_info->rank], size, size, 1, 
                   put, 0, ITERATIONS->n_sample, i,
                   i%c_info->num_procs, &defect);
        }
      t2 = MPI_Wtime();
      *time=(t2 - t1)/ITERATIONS->n_sample;
    }
  else
    { 
      *time = 0.; 
    }
}
Example #7
/* Gaussian Elimination algorithm using MPI */
void gaussElimination() {

    MPI_Status status;
    MPI_Request request;
    int row, col, i, norm;
    float multiplier;

    /* Arrays with the first row and the number of rows that each processor will handle */
    int * first_row_A_array = (int*) malloc ( p * sizeof(int) );
    int * n_of_rows_A_array = (int*) malloc ( p * sizeof(int) );
    int * first_row_B_array = (int*) malloc ( p * sizeof(int) );
    int * n_of_rows_B_array = (int*) malloc ( p * sizeof(int) );
    for ( i = 0; i < p; i++ ) {
        first_row_A_array[i] = 0;
        n_of_rows_A_array[i] = 0;
        first_row_B_array[i] = 0;
        n_of_rows_B_array[i] = 0;
    }

    /* Main loop. After iteration [norm], column [norm] has all zero values below the pivot row */
    for (norm = 0; norm < N-1; norm++) {

        /* --------------------------------------- */
        /*  Broadcasting of common values          */
        /*  -------------------------------------- */
        /* Broadcast the A[norm] row and B[norm], important values of this iteration */
        MPI_Bcast( &A[ N*norm ], N, MPI_FLOAT, SOURCE, MPI_COMM_WORLD );
        MPI_Bcast( &B[norm], 1, MPI_FLOAT, SOURCE, MPI_COMM_WORLD );

        /* ---------------------------------------   */
        /*  Calculation of number of rows to operate */
        /*  --------------------------------------   */
        /* subset of rows of this iteration */
        int subset = N - 1 - norm;
        /* number that indicates the step as a float */
        float step = ((float)subset ) / (p);
        /* First and last rows that this process will work into for this iteration */
        int first_row = norm + 1 + ceil( step * (my_rank) );
        int last_row = norm + 1 + floor( step * (my_rank+1) );
        if ( last_row >= N ) last_row = N-1;
        int number_of_rows = last_row - first_row +1;

        /*printf("\nProcess number %d of %d says in iteration %d that a=%d, b=%d and n=%d\n",
                            my_rank+1, p, norm+1,first_row,last_row,number_of_rows) ;*/



        /* --------------------------------------- */
        /*  Send data from process 0 to others     */
        /*  -------------------------------------- */
        if ( my_rank == SOURCE ) {

            for ( i = 1; i < p; i++ ) {

                /* We send to each process the amount of data that they are going to handle */
                int first_row_rmte = norm + 1 + ceil( step * (i) );
                int last_row_rmte = norm + 1 + floor( step * (i+1) );
                if( last_row_rmte >= N ) last_row_rmte = N -1;
                int number_of_rows_rmte = last_row_rmte - first_row_rmte +1;

                /* In case this process isn't assigned any task, continue. This happens when there are more processors than rows */
                //if( number_of_rows_rmte < 1 || first_row_rmte >= N ) continue;

                if ( number_of_rows_rmte < 0 ) number_of_rows_rmte = 0;
                if ( first_row_rmte >= N ) { number_of_rows_rmte = 0; first_row_rmte = N-1; };

                first_row_A_array[i] = first_row_rmte * N;
                first_row_B_array[i] = first_row_rmte;
                n_of_rows_A_array[i] = number_of_rows_rmte * N;
                n_of_rows_B_array[i] = number_of_rows_rmte ;

                //MPI_Isend( &A[first_row_rmte * N], N * number_of_rows_rmte, MPI_FLOAT, i,0, MPI_COMM_WORLD, &request);
                //MPI_Isend( &B[first_row_rmte],         number_of_rows_rmte, MPI_FLOAT, i,0, MPI_COMM_WORLD, &request);

            }
            
        }
        /* Receiver side */
       /* else {

            if ( number_of_rows > 0  && first_row < N) {

                //MPI_Recv( &A[first_row * N], N * number_of_rows, MPI_FLOAT, SOURCE, 0, MPI_COMM_WORLD, &status);
                //MPI_Recv( &B[first_row],         number_of_rows, MPI_FLOAT, SOURCE, 0, MPI_COMM_WORLD, &status);
            }
        }*/

        MPI_Scatterv(
            &A[0],              // send buffer
            n_of_rows_A_array,  // array with number of elements in each chunk
            first_row_A_array,  // array with pointers to initial element of each chunk
            MPI_FLOAT,          // type of elements to send
            &A[first_row * N],  // receive buffer
            N * number_of_rows, // number of elements to receive
            MPI_FLOAT,          // type of elements to receive
            SOURCE,             // who sends
            MPI_COMM_WORLD       
        );
        MPI_Scatterv(
            &B[0],
            n_of_rows_B_array,
            first_row_B_array,
            MPI_FLOAT,
            &B[first_row],
            number_of_rows,
            MPI_FLOAT,
            SOURCE,
            MPI_COMM_WORLD
        );   

        
        /*printf("\nProcess %d: Iteration number %d of %d\n",
                    my_rank, norm+1, N-1);
        print_A();*/



        /* --------------------------------------- */
        /*  Gaussian elimination                   */
        /*  The arrays only have the needed values */
        /*  -------------------------------------- */

        if ( number_of_rows > 0  && first_row < N) {  
            /* Similar code to the sequential case */
            for (row = first_row; row <= last_row; row++) {

                multiplier = A[N*row + norm] / A[norm + N*norm];
                for (col = norm; col < N; col++) {
                    A[col+N*row] -= A[N*norm + col] * multiplier;
                }

                B[row] -= B[norm] * multiplier;
            }
        }


        /* --------------------------------------- */
        /*  Send back the results                  */
        /*  -------------------------------------- */
        /* Sender side */

        if ( my_rank != SOURCE ) {
            if ( number_of_rows > 0  && first_row < N) {
                MPI_Isend( &A[first_row * N], N * number_of_rows, MPI_FLOAT, SOURCE,0, MPI_COMM_WORLD, &request);
                MPI_Isend( &B[first_row],         number_of_rows, MPI_FLOAT, SOURCE,0, MPI_COMM_WORLD, &request);
            }
        }
        /* Receiver side */
        else {

            for ( i = 1; i < p; i++ ) {

                // In case this process isn't assigned any task, continue. This happens when there are more processors than rows 
                if( n_of_rows_B_array[i] < 1  || first_row_B_array[i] >= N) continue;

                MPI_Recv( &A[ first_row_A_array[i] ], n_of_rows_A_array[i] , MPI_FLOAT, i,0, MPI_COMM_WORLD, &status );
                MPI_Recv( &B[ first_row_B_array[i] ], n_of_rows_B_array[i] , MPI_FLOAT, i,0, MPI_COMM_WORLD, &status );
            }

            
        }
        /*
        MPI_Gatherv(
            &A[first_row * N],       // send buffer
            N * number_of_rows,      // number of elements to send
            MPI_FLOAT,               // type of elements to send
            &A[0],                   // receive buffer
            n_of_rows_A_array,       // array with number of elements in each chunk
            first_row_A_array,       // array with pointers to initial element of each chunk, in the reception buffer
            MPI_FLOAT,               // type of elements to receive
            SOURCE,                  // who receives
            MPI_COMM_WORLD
        );

        MPI_Gatherv(
            &B[first_row],
            number_of_rows,
            MPI_FLOAT,
            &B[0], 
            n_of_rows_B_array,
            first_row_B_array,
            MPI_FLOAT,
            SOURCE,
            MPI_COMM_WORLD
        );
    */
    }
}
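
To illustrate the row partitioning above: with N = 11, norm = 0 and p = 3, subset = 10 and step ≈ 3.33, so rank 0 works on rows 1-4 (first_row = 1, last_row = 1 + floor(3.33) = 4), rank 1 on rows 5-7, and rank 2 on rows 8-10, which together cover every row below the pivot row.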
Example #8
int main(int argc, char *argv[])
{

    int m,n,c,iters;
    int my_m, my_n, my_rank, num_procs, recv_count, my_recv_count, block_size, smallest_block_size;

    float kappa;
    image u, u_bar;
    unsigned char *image_chars, *my_image_chars, *new_image_chars, *my_new_image_chars;
    char *input_jpeg_filename, *output_jpeg_filename;
    my_rank = 0;

    char * kappa_str;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    int displs[num_procs],recvDispls[num_procs],sendCounts[num_procs], recvCounts[num_procs];
    int i,my_m_rest;
    /*
     *read from command line: kappa,iters,input_jpeg_filename,output_jpeg_filename;
     */
    input_jpeg_filename = argv[1];   // already a char*
    output_jpeg_filename = argv[2];  // already a char*
    kappa_str = (argv[3]);           // converted to double below
    iters = atoi(argv[4]);           // converted to int
    //printf("iters: %d\n",iters);
    kappa = 0.01;                    // default, overwritten by the command-line value below
    kappa = atof(kappa_str);


    if(my_rank==0){
        import_JPEG_file(input_jpeg_filename, &image_chars, &m, &n, &c);
    }

    /////////////////////////////////////////////////////////////////
    //Broadcasts the size from root(=0) to all the other processes.//
    /////////////////////////////////////////////////////////////////
    MPI_Bcast(&m,1,MPI_INT,0,MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    /*
     *divide the m x n pixels evenly among the MPI processes
     */
    my_n = n;//this is correct
    my_m = (m-2)/num_procs;//without ghost points
    my_m_rest = (m-2)%num_procs;
    smallest_block_size = my_m*my_n;
    if(my_rank<my_m_rest){
        my_m+=1;
    }
    printf("my_m: %d\n", my_m);
    block_size = my_m*n;
    /////////////////////////////////////////////////////////////////////////
    //the last process get a larger my_m if m/num_procs is a decimal number//
    /////////////////////////////////////////////////////////////////////////
//    if(my_rank==num_procs-1){
//        my_m = my_m + (m-2)%num_procs;
//    }
    my_recv_count = my_m*my_n;


    /////////////////////////////////////////////////////
    //this is the picture divided into two processes.
    // n-->
    // ----------------------- m
    // |                     | |
    // |          0          | v
    // -----------------------
    // |                     |
    // |          1          |
    // -----------------------
    ///////////////////////////////////////////////////////
    allocate_image(&u, my_m, my_n);
    allocate_image(&u_bar, my_m, my_n);
    my_image_chars = malloc((block_size+2*n)*(sizeof(int)));


    if(my_rank==0){
        int last_displ=0;
        int current_block_size;
        for(i=0;i<my_m_rest;i++){
            current_block_size = smallest_block_size + n;
            sendCounts[i] = current_block_size + 2*n;
            recvCounts[i] = current_block_size;
            displs[i] = current_block_size*i;
            recvDispls[i] = 0;
            //printf("sendCounts: %d\n", sendCounts[i]);
            printf("displ: %d\n",displs[i]/n);
            last_displ = displs[i];
        }
        printf("rest: %d\n", my_m_rest);
        for(i=my_m_rest;i<num_procs;i++){
            printf("%d\n",i);
            current_block_size = smallest_block_size;
             printf("%d\n", current_block_size);
            sendCounts[i] = current_block_size+2*n;
            recvCounts[i] = current_block_size;
            if(i==0){
                displs[i] = 0;
            }else{
                displs[i] = displs[i-1] + current_block_size;
            }

            recvDispls[i] = 0;
            //printf("sendCounts: %d\n", sendCounts[i]);
            printf("displ: %d\n",  displs[i]/n);
        }
    }

    /*
     *each process asks process 0 for partitiones region
     *of image_chars and copy the values into u
    */
    //MPI_Scatterv(image_chars, sendCounts, displs, MPI_CHAR, my_image_chars, recv_count, MPI_CHAR, 0, MPI_COMM_WORLD);
    //MPI_Scatter(&image_chars, my_m*my_n,MPI_CHAR, &my_image_chars, my_m*my_n, MPI_CHAR, 0,MPI_COMM_WORLD);//assume first that there will be no extra rows
    //MPI_Scatter(image_chars, block_size, MPI_CHAR, my_image_chars, block_size, MPI_CHAR, 0, MPI_COMM_WORLD);
    MPI_Scatterv(image_chars, sendCounts, displs, MPI_CHAR, my_image_chars, block_size+2*n, MPI_CHAR, 0, MPI_COMM_WORLD);



    int start = 0;
    convert_char_to_float(my_image_chars, &u,my_m+2, my_n,start);
    //printf("%f", kappa);
    iso_diffusion_denoising(&u, &u_bar, kappa, iters);

    /*
     *each process sends its resulting content of u_bar to process 0
     *process 0 receives from each process incoming vaules and
     *copy them into the designated region of image_chars
     */

    //convert_float_to_char(&image_chars,&u,my_m, my_n,start);
        int x,y, pict_number,value;
        for(x=0;x<my_m+2;x++){
            for(y=0;y<my_n;y++){
                pict_number = x*n + y;
                value = (int)(u.image_data[x][y]);
                my_image_chars[pict_number] = (unsigned char) value;
            }
        }

        //MPI_Gather(my_image_chars, block_size, MPI_CHAR, image_chars, block_size, MPI_CHAR, 0,MPI_COMM_WORLD);
        //MPI_Gatherv(my_image_chars, block_size, MPI_CHAR, image_chars,recvCounts, displs, MPI_CHAR,0, MPI_COMM_WORLD);
        //MPI_Gatherv(my_image_chars, block_size+2*n, MPI_CHAR, image_chars, sendCounts, displs, MPI_CHAR, 0,MPI_COMM_WORLD);
        MPI_Send(my_image_chars,block_size+2*n, MPI_CHAR, 0,0, MPI_COMM_WORLD);
   int k,p;
    if(my_rank == 0){
        //receive the computed my_image_chars from all processes
        my_new_image_chars = malloc(block_size*sizeof(int));
        new_image_chars = malloc(n*m*sizeof(int));
        for(i=0;i<n*m;i++){
            new_image_chars[i] = 0;
        }
        for(i=0;i<num_procs;i++){
            MPI_Recv(my_new_image_chars,sendCounts[i], MPI_CHAR,i,0,MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            start = displs[i];//i*(sendCounts[i]-2*n);
            for(k=0;k<sendCounts[i];k++){
                new_image_chars[start + k]= my_new_image_chars[k];
            }

        }

        export_JPEG_file(output_jpeg_filename, new_image_chars,m,n,c,75);
    }

    deallocate_image(&u);
    deallocate_image(&u_bar);
    //printf("Hello World!\n");
    MPI_Finalize();
    return 0;
}
Example #9
int main(int argc, char **argv)
{
    int rank, size, myrow, mycol, nx, ny, stride, cnt, i, j, errs, errs_in_place;
    double *sendbuf, *recvbuf;
    MPI_Datatype vec, block, types[2];
    MPI_Aint displs[2];
    int *scdispls;
    int blens[2];
    MPI_Comm comm2d;
    int dims[2], periods[2], coords[2], lcoords[2];
    int *sendcounts;


    MTest_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Get a 2-d decomposition of the processes */
    dims[0] = 0;
    dims[1] = 0;
    MPI_Dims_create(size, 2, dims);
    periods[0] = 0;
    periods[1] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d);
    MPI_Cart_get(comm2d, 2, dims, periods, coords);
    myrow = coords[0];
    mycol = coords[1];
/*
    if (rank == 0)
        printf("Decomposition is [%d x %d]\n", dims[0], dims[1]);
*/

    /* Get the size of the matrix */
    nx = 10;
    ny = 8;
    stride = nx * dims[0];

    recvbuf = (double *) malloc(nx * ny * sizeof(double));
    if (!recvbuf) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    sendbuf = 0;
    if (myrow == 0 && mycol == 0) {
        sendbuf = (double *) malloc(nx * ny * size * sizeof(double));
        if (!sendbuf) {
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    sendcounts = (int *) malloc(size * sizeof(int));
    scdispls = (int *) malloc(size * sizeof(int));

    MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
    blens[0] = 1;
    blens[1] = 1;
    types[0] = vec;
    types[1] = MPI_UB;
    displs[0] = 0;
    displs[1] = nx * sizeof(double);

    MPI_Type_struct(2, blens, displs, types, &block);
    MPI_Type_free(&vec);
    MPI_Type_commit(&block);

    /* Set up the transfer */
    cnt = 0;
    for (i = 0; i < dims[1]; i++) {
        for (j = 0; j < dims[0]; j++) {
            sendcounts[cnt] = 1;
            /* Using Cart_coords makes sure that ranks (used by
             * sendrecv) matches the cartesian coordinates (used to
             * set data in the matrix) */
            MPI_Cart_coords(comm2d, cnt, 2, lcoords);
            scdispls[cnt++] = lcoords[0] + lcoords[1] * (dims[0] * ny);
        }
    }

    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block, recvbuf, nx * ny, MPI_DOUBLE, 0, comm2d);
    if ((errs = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], 0))) {
        fprintf(stdout, "Failed to transfer data\n");
    }

    /* once more, but this time passing MPI_IN_PLACE for the root */
    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block,
                 (rank == 0 ? MPI_IN_PLACE : recvbuf), nx * ny, MPI_DOUBLE, 0, comm2d);
    errs_in_place = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], (rank == 0));
    if (errs_in_place) {
        fprintf(stdout, "Failed to transfer data (MPI_IN_PLACE)\n");
    }

    errs += errs_in_place;

    if (sendbuf)
        free(sendbuf);
    free(recvbuf);
    free(sendcounts);
    free(scdispls);
    MPI_Type_free(&block);
    MPI_Comm_free(&comm2d);
    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}
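
The MPI_UB marker used above to shrink the extent of the vector type was removed in MPI-3; a minimal sketch of the same effect with MPI_Type_create_resized, assuming the vec/block variables declared above:

/* Sketch: resize the vector type so its extent is one block column (nx doubles). */
MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
MPI_Type_create_resized(vec, 0, (MPI_Aint)(nx * sizeof(double)), &block);
MPI_Type_free(&vec);
MPI_Type_commit(&block);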
Example #10
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status;

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // coordinates[] now holds the grid coordinates of process rank

	// create a row communicator
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);

	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n",
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0
				|| matrices_a_b_dimensions[1] % sqrt_size != 0
				|| matrices_a_b_dimensions[2] % sqrt_size != 0
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}

	// send dimensions to all peers
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator);
		}
	} else {
		MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	//has to be blocking, because the dimensions are used right afterwards...
	MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator);
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}

	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column]
							= A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column]
							= B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	}

	// send a block to each process
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);
			MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);

		}
		for(i = 0; i < A_local_block_size; i++){
			A_local_block[i] = A_array[i];
		}
		for(i = 0; i < B_local_block_size; i++){
			B_local_block[i] = B_array[i];
		}
	} else {
		MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
		MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*MPI_Scatter(A_array,
    A_local_block_size, //int send_count,
    MPI_DOUBLE,
    A_local_block,
    A_local_block_size,
    MPI_DOUBLE,
    0,
    cartesian_grid_communicator);*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


	// fix initial arrangements before the core algorithm starts - the point is that the blocks already have to be shifted before the first computational phase of the algorithm begins...
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*if(coordinates[0] != 0){
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
				(coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0,
				(coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
	}
	if(coordinates[1] != 0){
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0,
				(coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// two independent scattervs one after another, so they can be non-blocking, but a wait is needed right afterwards, since the data is then used...
	int displsA[size];
	int displsB[size];
	int localblsizA[size];
	int localblsizB[size];
	MPI_Request requests[2];
	MPI_Status statuses[2];
	for (int i=0; i<sqrt_size; i++){
		for (int j=0; j<sqrt_size; j++){
			displsA[i*sqrt_size + j] = (i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size;
			displsB[i*sqrt_size + j] = (j + ((j+i)%sqrt_size)*sqrt_size)*B_local_block_size;
			localblsizA[i*sqrt_size+j] = A_local_block_size;
			localblsizB[i*sqrt_size+j] = B_local_block_size;
		}
	}
	MPI_Iscatterv(A_array, localblsizA, displsA,
	              MPI_DOUBLE, A_local_block, A_local_block_size,
	              MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[0]);
	MPI_Iscatterv(B_array, localblsizB, displsB,
	              MPI_DOUBLE, B_local_block, B_local_block_size,
	              MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[1]);
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	//this initial realignment will not be needed if you write the data into A_array already aligned at the start...
	//    	because then a plain scatter is enough.
	//Isaias also said we can leave this in and just explain that it cannot be rewritten as a collective, because it is
	//    	an if statement and not all ranks participate; the whole point of a collective is that all ranks participate!
	//
	//There is one more option: scatterv! with suitable displacements!

	// cannon's algorithm
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%% this part is for SCATTER + GATHER
	int dispA[sqrt_size];
	int dispB[sqrt_size];
	int localsizesA[sqrt_size];
	int localsizesB[sqrt_size];

	// buffers that hold a whole block row/column, only needed at the gather roots
	double *A_rowarray = NULL, *B_rowarray = NULL;
	if (coordinates[0]==0) {
		B_rowarray = (double *) malloc(sqrt_size * B_local_block_size * sizeof(double));
	}
	if (coordinates[1]==0){
		A_rowarray = (double *) malloc(sqrt_size * A_local_block_size * sizeof(double));
	}

	for (int i=0; i<sqrt_size; i++){
		dispA[i] = ((i+1)%sqrt_size)*A_local_block_size;
		dispB[i] = ((i+1)%sqrt_size)*B_local_block_size;
		localsizesA[i] = A_local_block_size;
		localsizesB[i] = B_local_block_size;
	}
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;

	MPI_Waitall(2, requests, statuses);
	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
																		B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		//start = MPI_Wtime();
		// rotate blocks horizontally
		/*MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, //to bi slo z MPI_alltoallv, in tisto variablo za replacing. ampak bi blo inefficient - glej komentarje!
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start; */



		//if you use sendrecv, each row/column needs sqrt_size communications in total (i.e. SQRT_SIZE sends + SQRT_SIZE receives);
		//if you used alltoall instead you would have SIZE of them (even though some of the blocks sent would have size 0), which is also very inefficient (see the comments):
		/*This is allowed by the standard, but be warned that it is likely to perform
		poorly compared to what could be done with point-to-point or one-sided
		operations if most links are empty. ! ! ! ! ! ! ! ! */
		//you could also do it with gather+scatter, where one rank in each row/column gathers everything and sends it back shifted. But that would still mean
		// 2*SQRT_SIZE communications, and it would have to be blocking, since the data is used right afterwards. It might sound good to do this because even though
		//you need the same amount of communication, the collectives are optimized; so the communication alone should in this case take less time. However it's probably
		//not that big of a difference... we could still try it, though.
		//An even better idea seems to be to figure out the pattern in which the blocks are shifted, and only use A_array to scatter it from rank 0
		//in the right order to all other ranks... This way we would need SIZE communications in total (rank 0 with everyone else), while with the previous
		//way we would need num_rows/columns*2*SQRT_SIZE altogether, which is twice as many. However, with this last way we would also need to compute the right
		//indices for the scatter every time?
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% SCATTER + GATHER
		//you still need to reserve space for the ranks that store all the blocks of a given row/column
		start = MPI_Wtime();

		// root 0 of each sub-communicator is the rank that owns the row/column buffer allocated above
		MPI_Gather(A_local_block, A_local_block_size, MPI_DOUBLE,
		           A_rowarray, A_local_block_size, MPI_DOUBLE, 0, row_communicator);
		MPI_Gather(B_local_block, B_local_block_size, MPI_DOUBLE,
		           B_rowarray, B_local_block_size, MPI_DOUBLE, 0, column_communicator);

		MPI_Scatterv(A_rowarray, localsizesA, dispA,
		             MPI_DOUBLE, A_local_block, A_local_block_size,
		             MPI_DOUBLE, 0, row_communicator);
		MPI_Scatterv(B_rowarray, localsizesB, dispB,
		             MPI_DOUBLE, B_local_block, B_local_block_size,
		             MPI_DOUBLE, 0, column_communicator);

 	  mpi_time += MPI_Wtime() - start;
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% SCATTER from origin
		start = MPI_Wtime(); // or should this be measured only further down, after the loops?

		for (int i=0; i<sqrt_size-1; i++){
			for (int j=0; j<sqrt_size-1; j++){
				displsA[i*sqrt_size + j] += A_local_block_size;
				displsB[i*sqrt_size + j] += B_local_block_size*sqrt_size;
			}
		}
		for (int i=0; i<sqrt_size; i++){
			displsA[size - sqrt_size + i] -= A_local_block_size*(sqrt_size-1);
			displsB[size - sqrt_size + i] -= B_local_block_size*(sqrt_size-1)*sqrt_size;
		}
		MPI_Scatterv(A_array, localblsizA, displsA,
	                 MPI_DOUBLE, A_local_block, A_local_block_size,
	                 MPI_DOUBLE, 0, cartesian_grid_communicator);
  	MPI_Scatterv(B_array, localblsizB, displsB,
 	                 MPI_DOUBLE, B_local_block, B_local_block_size,
 	                 MPI_DOUBLE, 0, cartesian_grid_communicator);

		mpi_time += MPI_Wtime() - start;
		//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	}


	// get C parts from other processes at rank 0
	/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	if(rank == 0) {
		for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){
			C_array[i] = C_local_block[i];
		}
		int i;
		for(i = 1; i < size; i++){
			MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns), A_local_block_rows * B_local_block_columns,
				MPI_DOUBLE, i, 0, cartesian_grid_communicator, &status);
		}
	} else {
		MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0, cartesian_grid_communicator);
	}*/
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
               		 C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
                   0, cartesian_grid_communicator);  // blocking, because the result is used right afterwards... right?
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	// generating output at rank 0
	if (rank == 0) {
		// convert the ID array into the actual C matrix
		int i, j, k, row, column;
		for (i = 0; i < sqrt_size; i++){  // block row index
			for (j = 0; j < sqrt_size; j++){ // block column index
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						C[i * A_local_block_rows + row] [j * B_local_block_columns + column] =
							C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns)
							+ (row * B_local_block_columns) + column];
					}
				}
			}
		}

		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);

		if (argc == 4){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					printf("%7.3f ", temp);
					if(temp != C[i][j]){
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}
	}

	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();
}
Exemple #11
bool scatter(){
	int i, j;
	int count;
	int count_tot;
	int* count_root;
	int* displ;

	MPI_Bcast(&idx, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&idy, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&idz, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&iddx, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&iddy, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&iddz, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&qch, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&dV, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&dAdrop, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&dApart, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(&droplet, 1, MPI_INT, root, MPI_COMM_WORLD);
	MPI_Bcast(&length, 1, MPI_INT, root, MPI_COMM_WORLD);
	MPI_Bcast(&AnchNInf, 1, MPI_BYTE, root, MPI_COMM_WORLD);

	MPI_Barrier(MPI_COMM_WORLD);
	// define a shared window and scatter Qold from the root processor into q, which every processor can access
	MPI_Win_allocate_shared(6 * length * sizeof(float), 1, MPI_INFO_NULL, shmcomm, &q, &win);
	MPI_Scatter(Qold, 6 * length, MPI_FLOAT, q, 6 * length, MPI_FLOAT, root, MPI_COMM_WORLD);
	
	// scatter share from the root processor into sign on every processor
	sign = (int*)malloc(length * sizeof(int));
	for(i = 0; i < length; i ++)	sign[i] = -1;
	MPI_Scatter(share, length, MPI_INT, sign, length, MPI_INT, root, MPI_COMM_WORLD);
	
	//Allocate Qnew(qn)
	qn = (float*)malloc(6 * length * sizeof(float));
	for(i = 0; i < 6 * length; i ++)	qn[i] = q[i];	

	// scatter neighbor from the root processor into neigb on every processor
	neigb = (int*)malloc(6 * length * sizeof(int));
	MPI_Scatter(neighbor, 6 * length, MPI_INT, neigb, 6 * length, MPI_INT, root, MPI_COMM_WORLD);
	
	// Adjust the indices so that each processor accesses q correctly in the shared window
	for(i = 0; i < 6 * length; i ++){
		neigb[i] -= length * myid;
	}
//	printf("%d:\t%d\t%d\t%d\t%d\t%d\t%d.\n", myid, neigb[0], neigb[1], neigb[2], neigb[3], neigb[4], neigb[5]);
	
	// Verify the droplet and boundary node counts. If they are inconsistent, report an error.
	count = 0;
	for(i = 0; i < length; i ++){
		if(sign[i] >= 0 && sign[i] < 10)	count ++;
	}

 	MPI_Reduce(&count, &count_tot, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD);
	if(myid == root && count_tot != droplet){
		printf("Error in scatter. Counted number %d is not equal to droplet %d.\n", count_tot, droplet);	
		return false;
	}
	count = 0;
	for(i = 0; i < length; i ++){
		if(sign[i] >= 2 && sign[i] < 10)	count ++;
	}
 	MPI_Reduce(&count, &count_tot, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD);
	if(myid == root && count_tot != surf){
		printf("Error in scatter(boundary). Counted number %d is not equal to surface %d.\n", count_tot, surf);	
		return false;
	}
	count *= 3;

	nu_p = (double*)malloc(count * sizeof(double));
	count_root = (int*)malloc(numprocs * sizeof(int));
	displ = (int*)malloc(numprocs * sizeof(int));

//	if(myid == root) printf("Check3.\n");
//	scatter nu and qo defined at boundary nodes to different processors.
	MPI_Gather(&count, 1, MPI_INT, count_root, 1, MPI_INT, root, MPI_COMM_WORLD);		
	if(myid == root){
		for(i = 0; i < numprocs; i ++){
			displ[i] = 0;
			for(j = 0; j < i; j++){
				displ[i] += count_root[j];
			}
		}
	}
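	// Illustrative example (values assumed): if the gathered per-rank counts are {6, 3, 9},
	// the root builds displ = {0, 6, 9}, so each rank receives its own contiguous slice of nu below.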
        MPI_Scatterv(nu, count_root, displ, MPI_DOUBLE, nu_p, count, MPI_DOUBLE, root, MPI_COMM_WORLD);

	
	if((degenerate == 0 && infinite == 0) || AnchNInf){
		count *= 2;
		if(myid == root){	
			for(i = 0; i < numprocs; i ++){
				count_root[i] *= 2;
				displ[i] *= 2;
			}
		}
		qo_p = (float*)malloc(count * sizeof(float));
		MPI_Scatterv(Qo, count_root, displ, MPI_FLOAT, qo_p, count, MPI_FLOAT, root, MPI_COMM_WORLD);
	}

//	printf("check4.\n");
	if(myid == root){
		free(neighbor);
		free(Qold);
		free(share);
		free(nu);
		if((degenerate == 0 && infinite == 0) || AnchNInf)	free(Qo);
	}
	free(count_root);
	free(displ);
	return true;

}
Exemple #12
int main(int argc, char** argv) {
	
	// Record the program start time
	clock_t t_start = clock();
	
	int rank;
	int numtasks;
	int i;
	int stride;
	
	int vector[MAX];
	
	for(i = 1; i <= 100; i++)
		vector[ i - 1 ] = i;
	

	MPI_Init(&argc,&argv);
	MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	
	stride = MAX/(numtasks);
	//printf("Stride: %d\n", stride);
	
	int vtmp[stride];
	int disp[numtasks];
	int sendcount[numtasks];
	int acum;
	
	
	for(i = 0; i < numtasks; i++) {
		disp[i] = i * stride;
		sendcount[i] = stride;
	}
	
	// &vector: where the data is taken from
	// sendcount: how many elements I am going to send to each process
	// disp: displacement relative to sendbuff from which process i takes its values
	// sendtype: the type of data I am going to send
	// &recvbuff: where the received data is stored
	// recvcount: how many elements each process is going to receive
	// recvtype: the type of data it is going to receive
	// root: the process that originates the distribution of the data
	// comm: process communicator
	// MPI_Scatterv(&sendbuff, sendcount, disp, sendtype, &recvbuff, recvcount, recvtype, root, comm)
	
	MPI_Scatterv(vector, sendcount, disp, MPI_INT, vtmp, stride, MPI_INT, 0, MPI_COMM_WORLD);
	
	acum = 0;
	for(i = 0; i < stride; i++) {
		acum += vtmp[i];
	}
	
	printf("Subtotal %d en nodo %d\n", acum, rank);
	
	MPI_Reduce(&acum, vtmp, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
	
	if(rank == 0) {
		printf("TOTAL: %d\n", vtmp[0]);
		// Record the program end time
		clock_t t_end = clock();
		// Program execution time
		clock_t t_run = t_end - t_start;
		printf ("Tiempo de Ejecucion: (%f segundos).\n",((float)t_run)/CLOCKS_PER_SEC);
	}

	MPI_Finalize();
	
	
	return 0;
}
Exemple #13
int ORD::find_elim_ordering() {
    int ws;
    int wr;

    char eoname[512];
    char eoname_other[512];

    // Get size and rank from the communicator
    MPI_Comm_size(comm, &ws);
    MPI_Comm_rank(comm, &wr);

    double xtime = MPI_Wtime();
    sprintf(eoname, "%s.order.%d", this->filename.c_str(), ws);
    sprintf(eoname_other, "%s.order_other.%d", this->filename.c_str(), ws);

    DEBUG("size: %d, rank %d \n", ws, wr);
    int n = G->get_num_nodes();
    int x = n/ws;
    int xm = n%ws;
    int i = 0;
    DEBUG("n: %d x: %d xm: %d \n", n, x, xm);

    vector<int> xadj;
    vector<int> adjncy;

    vector<int> vtxdist(ws + 1, 0);
    vector<int> sizes(2*ws,0);
    vector<int> ordering(x+1, 0);
    vector<int> recvcnt(ws, 0);
    vector<int> displ(ws, 0);

    int numflag = 0;




    int options[10];

    options[0] = 0;
    vtxdist[0] = 0;
    for (i = 1; i <= ws; i++)
    {
        vtxdist[i] = vtxdist[i - 1] + x;
        if (i <= xm)
            vtxdist[i]++;
    }

    // preparing displacements and receive counts for use with MPI_Gatherv
    for (i = 0; i < ws; i++)
    {
        recvcnt[i] = x;
        if (i < xm)
            recvcnt[i] ++;

        if (i > 0)
            displ[i] += displ[i - 1] + recvcnt[i - 1];
    }
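    // Worked example (illustrative): with n = 10 nodes and ws = 3 processes, x = 3 and xm = 1,
    // so vtxdist = {0, 4, 7, 10}, recvcnt = {4, 3, 3} and displ = {0, 4, 7}.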

    DEBUG("range: %d, %d\n", vtxdist[wr], vtxdist[wr + 1]);
    int j = 0;
    xadj.push_back(0);
    for (i = vtxdist[wr]; i < vtxdist[wr + 1]; i++)
    {
        Graph::Node *no = G->get_node(i);
        list<int> *l = no->get_nbrs_ptr();
        list<int>::iterator it = l->begin();

        for (; it != l->end(); ++it)
        {
            adjncy.push_back(*it);
            j++;
        }
        xadj.push_back(j);
    }

    if (METIS_OK != ParMETIS_V3_NodeND(&vtxdist.front(), &xadj.front(), &adjncy.front(), &numflag, options, &ordering.front(), &sizes.front(), &comm))
    {
        FERROR("error occured while processing parmetis, aborting\n");
        MPI_Abort(MPI_COMM_WORLD, -1);
    }

    DEBUG("output from ParMETIS\n");
    double parmet_time = MPI_Wtime() - xtime;

    vector<int> recvbuf;
    n = G->get_num_nodes();
    if (wr == 0)
    {
        recvbuf = vector<int>(n, 0);
    }

    if (MPI_SUCCESS !=
        MPI_Gatherv((void *)&ordering.front(), recvcnt[wr], MPI_INT,
                    (void *)&recvbuf.front(), &recvcnt.front(), &displ.front(), MPI_INT,
                    0, comm))
    {
        FERROR("MPI error occured at Gatherv, Abort!\n");
        MPI_Abort(comm, -1);
    }

    vector<int> eo(n, 0);
    if (wr == 0)
    {
        for (int i = 0; i < n; i++)
        {
            eo[recvbuf[i]] = i;
        }

        FILE *f = fopen(eoname_other, "w");
        for (int i = 0; i < n; i++)
            fprintf(f, "%d\n", eo[i] + 1);
        fclose(f);
        DEBUG("ParMetis NodeND elimination ordering is in : %s\n", eoname_other);
    }

    ordering.clear();
    ordering.resize(recvcnt[wr], 0);

    if (MPI_SUCCESS !=
        MPI_Scatterv ((void *)&eo.front(), &recvcnt.front(), &displ.front(), MPI_INT,
                      (void *)&ordering.front(), recvcnt[wr], MPI_INT,
                      0, comm))
    {
        FERROR("MPI error occured at Scatterv, Abort! \n");
        MPI_Abort(comm, -1);
    }

    DEBUG("Scatterv completed\n");

    Graph::GraphCreatorFile gf;
    Graph::WeightedMutableGraph *wg;
    Graph::GraphEOUtil eoutil;
    Graph::GraphProperties prop;
    list<int>members(ordering.begin(), ordering.end());

    wg = gf.create_component(G, &members, false);
    prop.make_canonical(wg);

    vector<int> ord(recvcnt[wr], 0);
    vector<int> ordsend(recvcnt[wr], 0);
    double xxtime = MPI_Wtime();
    eoutil.find_elimination_ordering(wg, &ord, GD_AMD, false);
    DEBUG("eo time : %f\n", MPI_Wtime() - xxtime);
    
    int sz = recvcnt[wr];

    for (int i = 0; i < sz; i++)
        ordsend[i] = wg->get_node(ord[i])->get_label();


    recvbuf.assign(n, -1);
    if (MPI_SUCCESS !=
        MPI_Gatherv((void *)&ordsend.front(), recvcnt[wr], MPI_INT, (void *)&recvbuf.front(), &recvcnt.front(), &displ.front(), MPI_INT, 0, comm))
    {
        FERROR("MPI error occured at Gatherv, Abort!\n");
        MPI_Abort(comm, -1);
    }

    double p_amd_time = MPI_Wtime() - xtime;
    if (wr == 0)
    {
        FILE *f = fopen(eoname, "w");
        for (int i = 0; i < n && wr == 0; i++)
            fprintf(f, "%d\n", recvbuf[i]);
        fclose(f);
    } 
    DEBUG("ordering is written into %s\n", eoname);
    DEBUG("%f,%f\n", parmet_time, p_amd_time);

    return 0;
}
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status; 

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	double init_time = 0.0, start;

	MPI_Init(&argc, &argv);
	start = MPI_Wtime();
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);             
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid 
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;    
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates);

	// create a row communicator
	remain_dims[0] = 0;            
	remain_dims[1] = 1; 
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);

	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions 
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", 
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0 
				|| matrices_a_b_dimensions[1] % sqrt_size != 0 
				|| matrices_a_b_dimensions[2] % sqrt_size != 0 
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}

	// send dimensions to all peers
	/* @collectives:
	 * MPI_Broadcast
	 */
	MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator);

	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}

	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] 
							= A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] 
							= B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	} 

	// send a block to each process
	/* @collectives:
	 * MPI_Scatter with sendcount=A/B_local_block_size. The if-else clause and the for-loops can be replaced.
	 */
	{
		//compute displacements
		int row_displs[size]; // displacements for A
		int col_displs[size]; // displacements for B

		int row, col;
		for(row = 0; row < sqrt_size; ++row) {
			for(col = 0; col < sqrt_size; ++col) {
				int i = row*sqrt_size + col;
				if(row != 0) {
					int col_loc = (col + sqrt_size - row) % sqrt_size;
					row_displs[i] =  row*sqrt_size + col_loc;
				} else {
					row_displs[i] = i;
				}
				row_displs[i] *= A_local_block_size;

				if(col != 0) {
					int row_loc = (row + sqrt_size - col) % sqrt_size;
					col_displs[i] = row_loc*sqrt_size + col;
				} else {
					col_displs[i] = i;
				}
				col_displs[i] *= B_local_block_size;
			}
		}
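		// Worked example (illustrative): for sqrt_size = 2 the loops above produce, in block units,
		// row_displs = {0, 1, 3, 2} and col_displs = {0, 3, 2, 1}, i.e. process (1,0) starts with
		// A block (1,1) and B block (1,0) - exactly Cannon's initial alignment
		// (row i of A shifted left by i, column j of B shifted up by j).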

		// set counts for scattering A;
		int counts[size];
		int i;
		for(i = 0; i < size; ++i) {
			counts[i] = A_local_block_size;
		}

		MPI_Scatterv(A_array, counts, row_displs, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);

		for(i = 0; i < size; ++i) {
			counts[i] = B_local_block_size;
		}

		MPI_Scatterv(B_array, counts, col_displs, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);

	}

	init_time += MPI_Wtime() - start;

	// cannon's algorithm
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0;
	int C_index, A_row, A_column, B_column;
	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
						B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		start = MPI_Wtime();
		// rotate blocks horizontally
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, 
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0, 
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, 
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0, 
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start;
	}

	// get C parts from other processes at rank 0
	/* @collectives:
	 * MPI_Gather with sendcount=A_local_block_rows * B_local_block_columns
	 */
	double output_time = 0.0;
	start = MPI_Wtime();
	MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
			C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
	        0, cartesian_grid_communicator);

	output_time += MPI_Wtime() - start;

	// generating output at rank 0
	if (rank == 0) {
		// convert the ID array into the actual C matrix 
		int i, j, k, row, column;
		for (i = 0; i < sqrt_size; i++){  // block row index
			for (j = 0; j < sqrt_size; j++){ // block column index
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = 
							C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) 
							+ (row * B_local_block_columns) + column];
					}
				}
			}
		}

		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);
		printf("Setup time:       %lf\n", init_time);
		printf("Output time:      %lf\n", output_time);

		if (argc == 4){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					printf("%7.3f ", temp);
					if(temp != C[i][j]){
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}	
	}

	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();
}
Exemple #15
int main(int argc, char *argv[]) {

  double startTime, endTime;
  int numElements, offset, stripSize, myrank, numnodes, N, i, j, k,x;

  int resto;

  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &numnodes);

  // the VLAs below can only be declared once numnodes is known
  int tamanio[numnodes];
  int desplazamiento[numnodes];

  N = atoi(argv[1]);

  resto = N % numnodes;
  
 
  double A[N][N], B[N][N], C[N][N]; 
  double auxA[N][N], auxC[N][N];

  if (myrank == 0) {
    // initialize A and B
    x=0;
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        A[i][j] = x;
        B[i][j] = x;
	      x++;
      }
    }
  }
  
  // Start the timer
  if (myrank == 0) {
    startTime = MPI_Wtime();
  }
  
  numElements = N/numnodes;  // Size of the data chunk each worker operates on.

  desplazamiento[0] = 0;

  for(j=0;j < (numnodes - resto) ; j++){

  	tamanio[j] = numElements * N;
    if( j != numnodes - 1){
        desplazamiento[j+1] = desplazamiento[j] + (numElements * N);
    }

  }

  for( j = numnodes - resto ; j < numnodes; j++){
 	  tamanio[j] = (N * (numElements + 1));
    if( j != numnodes - 1){
        desplazamiento[j+1] = desplazamiento[j] + (N * (numElements + 1));
    }
  }
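  // Worked example (illustrative): N = 10, numnodes = 3 gives resto = 1 and numElements = 3,
  // so tamanio = {30, 30, 40} and desplazamiento = {0, 30, 60}: the last node takes the extra row.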

  for(i = 0; i < numnodes ; i++){

  	printf("desplazamiento[%d] = %d \n",i,desplazamiento[i]);
  	printf("tamanio[%d] = %d \n",i,tamanio[i]);
  }


  // The master scatters matrix A among the workers
  MPI_Scatterv(&A,tamanio,desplazamiento,MPI_DOUBLE, &auxA, tamanio[myrank], MPI_DOUBLE, 0,MPI_COMM_WORLD);
 
  // Everyone gets a full copy of B
  MPI_Bcast(&B, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Each process initializes its local result block auxC (and C) to 0.
  for (i=0; i<N; i++) {
    for (j=0; j<N; j++) {
      auxC[i][j] = 0.0;
      C[i][j] = 0.0;
    }
  }

  // Perform the local multiplication (only over the rows this worker actually received)
  for (i=0; i<tamanio[myrank]/N; i++) {
    for (j=0; j<N; j++) {
      for (k=0; k<N; k++) {
        auxC[i][j] += auxA[i][k] * B[k][j];
      }
    }
  }

  // The master gathers the pieces of the solution matrix
  MPI_Gatherv(&auxC,tamanio[myrank],MPI_DOUBLE,&C,tamanio,desplazamiento,MPI_DOUBLE,0,MPI_COMM_WORLD);

  // Stop the timer
  if (myrank == 0) {
    endTime = MPI_Wtime();
  }
  
   // Print matrix A
  if (myrank == 0 && N < 10) {
    printf("Matriz A:\n");
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        printf("%f ", A[i][j]);
      }
      printf("\n");
    }
  }
 // Print matrix B
  if (myrank == 0 && N < 10) {
    printf("\n");
    printf("Matriz B:\n");
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        printf("%f ", B[i][j]);
      }
      printf("\n");
    }
  }


  // Print matrix C
  if (myrank == 0 && N < 10) {
    printf("\n");
    printf("Matriz C:\n");
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        printf("%f ", C[i][j]);
      }
      printf("\n");
    }
  }

  if (myrank == 0) {
    printf("\n");
    printf("Ha tardado %f segundos.\n\n", endTime-startTime);
    printf("\n");
  }

  MPI_Finalize();
  return 0;
}
Exemple #16
int main(int argc, char **argv)
{
	int numprocs, rank, namelen, i;
	char processor_name[MPI_MAX_PROCESSOR_NAME];
	int *vx = NULL, *vy = NULL, *vz = NULL, *vxpart = NULL, *vypart = NULL,
			*vzpart = NULL, coeff[2];
	int exp = 0, act = 0;
	int *count = NULL;
	int *disp = NULL;

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* let the main process initialize the data */
	if (rank == 0) {
		vx = (int *) malloc(sizeof(int) * DIM_GLOBAL);
		vy = (int *) malloc(sizeof(int) * DIM_GLOBAL);
		vz = (int *) malloc(sizeof(int) * DIM_GLOBAL);

		for (i = 0; i < DIM_GLOBAL; i++) {
			vx[i] = i;
			vy[i] = i;
			vz[i] = 0;
			exp += 2 * i + 3 * i;
		}
		coeff[0] = 2;
		coeff[1] = 3;
	}

	/* compute size of chunks */
	sendcounts_array(&count, numprocs, DIM_GLOBAL);
	displs_array(&disp, count, numprocs);
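	/* presumably (their implementation is not shown here) these helpers produce a block
	   distribution, e.g. for DIM_GLOBAL = 10 and 3 processes something like
	   count = {4, 3, 3} and disp = {0, 4, 7} */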

	/* allocate work buffer to all, including the master process */
	vxpart = (int *) malloc(sizeof(int) * count[rank]);
	vypart = (int *) malloc(sizeof(int) * count[rank]);
	vzpart = (int *) malloc(sizeof(int) * count[rank]);

	/* Scatter the data to peers */
	MPI_Scatterv(vx, count, disp, MPI_INT, vxpart, count[rank],
			MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Scatterv(vy, count, disp, MPI_INT, vypart, count[rank],
			MPI_INT, 0, MPI_COMM_WORLD);

	/* Broadcast is done here because coeff is the same for all computations */
	MPI_Bcast(coeff, 2, MPI_INT, 0, MPI_COMM_WORLD);

	/* perform the actual computation */
	for (i = 0; i < count[rank]; i++) {
		vzpart[i] = coeff[0] * vxpart[i] + coeff[1] * vypart[i];
	}

	/* Gather the results */
	MPI_Gatherv(vzpart, count[rank], MPI_INT, vz, count, disp, MPI_INT,
			0, MPI_COMM_WORLD);

	/* verify result */

	if (rank == 0) {
		for (i = 0; i < DIM_GLOBAL; i++) {
			act += vz[i];
		}
		printf("exp=%d act=%d\n", exp, act);
	}
	if (rank == 0) {
		FREE(vx);
		FREE(vy);
		FREE(vz);
	}
	FREE(vxpart);
	FREE(vypart);
	FREE(vzpart);
	FREE(disp);
	FREE(count);
	MPI_Get_processor_name(processor_name, &namelen);
	MPI_Finalize();
	return 0;
}
Exemple #17
void GenVector_ReadCSV(denseType * vector, long length, long num_cols, char* rhsFile, int myid, int numprocs) {
    long idx;
    long local_length, local_length_normal;
    int ierr;
    double * Total_data_buffer;

    int sendCount[numprocs];
    int sendDispls[numprocs];

    long procCounter;
    double normel_ele_num;

    ierr = MPI_Bcast((void*) &length, 1, MPI_LONG, 0, MPI_COMM_WORLD);

#ifdef GETRHS_DEBUG
    printf("in GetRHS.c, myid=%d, length=%d\n", myid, length);
#endif

    local_length_normal = length / numprocs;
    if (myid == numprocs - 1)
        local_length = length - (numprocs - 1) * local_length_normal;
    else
        local_length = local_length_normal;
    normel_ele_num = local_length_normal * num_cols;
    for (procCounter = 0; procCounter < numprocs; procCounter++) {
        sendCount[procCounter] = (int)normel_ele_num;
        sendDispls[procCounter] = (int)procCounter * normel_ele_num;
    }
    sendCount[numprocs - 1] = (int)((length - (numprocs - 1) * local_length_normal) * num_cols);
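    // Worked example (illustrative): length = 10 rows, num_cols = 2, numprocs = 3 gives
    // local_length_normal = 3, sendCount = {6, 6, 8} and sendDispls = {0, 6, 12};
    // the last process picks up the remaining rows.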

#ifdef GETRHS_DEBUG
    printf("in GetRHS.c, myid = %d, local_length=%d\n", myid, local_length);
#endif
    vector->local_num_row = local_length;
    vector->local_num_col = num_cols; // only consider
    vector->global_num_row = length;
    vector->global_num_col = num_cols;

#ifdef GETRHS_DEBUG

#endif

    vector->data = (double *) calloc(vector->local_num_row * vector->local_num_col, sizeof (double));

    long local_num_element = vector->local_num_row * vector->local_num_col;

    // rank 0 read CSV
    if (myid == 0) {
        printf("Reading MRHS data from %s ... ...\n", rhsFile);
        parseCSV(rhsFile, &Total_data_buffer, length, num_cols);
        printf("Reading MRHS data from %s done.\n", rhsFile);
#ifdef GenVector_ReadCSV_DB
        //check_csv_array_print(Total_data_buffer, length, num_cols, myid);
        //        exit(0);
#endif

    }
    //    // Scatter data

// int MPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs,
//                  MPI_Datatype sendtype, void *recvbuf, int recvcount,
//                  MPI_Datatype recvtype,
//                  int root, MPI_Comm comm)

    ierr = MPI_Scatterv((void*) Total_data_buffer, (int*)sendCount, (int*)sendDispls,
            MPI_DOUBLE, vector->data, (int)local_num_element,
            MPI_DOUBLE, 0, MPI_COMM_WORLD);
    //
    //    // based on the assumption of equal division of rows among processes 
    vector->start_idx = myid * vector->global_num_col * local_length_normal;
#ifdef GenVector_ReadCSV_DB
    //    if (myid == 0){
    //        check_csv_array_print(vector->data, vector->local_num_row, vector->global_num_col, myid);
    //        printf ("local rows:%d, local cols %d, local nnz:%d\n", vector->local_num_row, vector->local_num_col,local_num_element);
    //    }
    if (myid == numprocs-1) {
        local_dense_mat_print(*vector, myid);
    }
    exit(0);
#endif    
    if (myid == 0) {
        free(Total_data_buffer);
    }

}
Exemple #18
int main(int argc, char *argv[])
{
	int m, n, c, iters, i, j;
	int my_m, my_n, my_rank, num_procs, size;
	float kappa;
	image u, u_bar;
	unsigned char *image_chars;
	char *input_jpeg_filename, *output_jpeg_filename;
	int *sendcounts, *displs, *recvcounts;

	printf("Now in main program\n");

	MPI_Init (&argc, &argv);
	MPI_Comm_rank (MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size (MPI_COMM_WORLD, &num_procs);

	/* num_procs is only known after MPI_Comm_size, so allocate the arrays here */
	sendcounts = (int*)malloc(num_procs*sizeof(int));
	displs = (int*)malloc(num_procs*sizeof(int));
	recvcounts = (int*)malloc(num_procs*sizeof(int));

	/* read from the command line: kappa, iters, input_jpeg filename, output_jpeg_filename */
	kappa = atof(argv[1]);
	iters = atoi(argv[2]);
	input_jpeg_filename = argv[3];
	output_jpeg_filename = argv[4];
	/* Test that parameters are read correctly from command line: 
	printf("kappa: %f\n", kappa);
	printf("iters: %d\n", iters);
	printf("input_jpeg_filename: %s\n", input_jpeg_filename);
	printf("output_jpeg_filename: %s\n", output_jpeg_filename);
	*/

	
	if (my_rank==0){
		import_JPEG_file(input_jpeg_filename, &image_chars, &m, &n, &c);
		printf("Successfully imported JPEG image.\n");
	}

	MPI_Bcast (&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Bcast (&n, 1, MPI_INT, 0, MPI_COMM_WORLD);


	/* Divide the m x n pixels evenly among the MPI processes */
	my_m = m/num_procs;
	my_n = n;


	/* If the pixels cannot be evenly divided, the last process picks up 	*/
	/* the remainder.  														*/
	/* Each process needs the rows above and below it. 						*/
	/* The first and last process only need 1 additional row. 				*/
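	/* Worked example (illustrative): m = 1000 rows and 4 processes gives my_m = 250 everywhere   */
	/* (no remainder); ranks 0 and 3 allocate 251 rows, ranks 1 and 2 allocate 252 rows.          */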
	if (my_rank == num_procs - 1){
		my_m += m % num_procs;
		allocate_image(&u, my_m+1, my_n);
		allocate_image(&u_bar, my_m+1, my_n);
	} else if (my_rank == 0){
		allocate_image(&u, my_m+1, my_n);
		allocate_image(&u_bar, my_m+1, my_n);
	} else {
		allocate_image (&u, my_m+2, my_n);
		allocate_image (&u_bar, my_m+2, my_n);
	}

	/* Each process asks process 0 for a partitioned region */
	/* of image_chars and copy the values into u */

	if (my_rank==0){
		/* the root needs the full sendcounts/displs arrays for MPI_Scatterv */
		int base = m/num_procs;
		for (i = 0; i < num_procs; i++){
			int rows = base;
			if (i == num_procs - 1) rows += m % num_procs;
			/* ghost rows: one extra for the first and last process, two otherwise */
			rows += (i == 0 || i == num_procs - 1) ? 1 : 2;
			sendcounts[i] = rows*my_n;
			displs[i] = (i == 0) ? 0 : my_n*(i*base - 1);
		}
	}
	if (my_rank == 0 || my_rank == num_procs - 1){
		size = (my_m + 1)*my_n;
	} else {
		size = (my_m + 2)*my_n;
	}


	
	/* image_chars, sendcounts and displs are already pointers; the receive buffer assumes
	   the image rows are stored contiguously starting at image_data[0] */
	MPI_Scatterv(image_chars, sendcounts, displs, MPI_UNSIGNED_CHAR, u.image_data[0], size, MPI_UNSIGNED_CHAR,
		0, MPI_COMM_WORLD);

	/* Convert data type from unsigned char to float: */
	for (i=0; i<my_m; i++)
	{
		for (j=0; j<my_n; j++)
		{
			u.image_data[i][j] = (float)u.image_data[i][j];
		}
	}

	iso_diffusion_denoising (&u, &u_bar, kappa, iters);

	/* Each process must convert the data type in u back */
	/* to unsigned char. */
	for (i=0; i<my_m; i++)
	{
		for (j=0; j<my_n; j++)
		{
			u.image_data[i][j] = (unsigned char)u.image_data[i][j];
		}
	}

	/* Each process sends its resulting content of u to process 0 */
	/* Process 0 receives the incoming values from each process and */
	/* copy them into the designated region of image_chars */
	/* ... */


	if (my_rank==0){
		displs[0] = 0;
		for (i = 1; i < num_procs; i++)
			displs[i] = i*(m/num_procs)*my_n;
	}
	size = my_m*my_n;



	if (my_rank==0){
		c = 1;
		export_JPEG_file(output_jpeg_filename, image_chars, m, n, c, 75);
		printf("Successfully exported JPEG image! \n");
	}

	deallocate_image(&u);
	deallocate_image(&u_bar);

	MPI_Finalize ();

	printf("Finished the program!\n");

	return 0;
}