Example #1
Grid::Grid(const simParams &params, bool debug) {
    debug_ = debug;


    //need to figure out which processor we are and who our neighbors are...
    MPI_SAFE_CALL( MPI_Comm_rank(MPI_COMM_WORLD, &ourRank_) );

    int totalNumProcessors;
    MPI_SAFE_CALL( MPI_Comm_size(MPI_COMM_WORLD, &totalNumProcessors) );

    //based on total number of processors and grid configuration
    //determine our neighbors
    procTop_ = -1;
    procBot_ = -1;

    //1D decomposition - horizontal stripes
    if (ourRank_ > 0) {
        procBot_ = ourRank_ - 1;
    }
    if (ourRank_ < totalNumProcessors - 1) {
        procTop_ = ourRank_ + 1;
    }
    //figure out dimensions and how big we need to allocate
    nx_ = params.nx();

    ny_ = params.ny() / totalNumProcessors;
    if (ourRank_ < params.ny() % totalNumProcessors) // to deal with cases where there is a remainder
        ++ny_;                                       // in the division of ny by the number of procs

    if (params.order() == 2)
        borderSize_ = 1;
    else if (params.order() == 4)
        borderSize_ = 2;
    else if (params.order() == 8)
        borderSize_ = 4;
    else
        assert(false && "unsupported stencil order"); // avoid leaving borderSize_ uninitialized

    assert(nx_ > 2 * borderSize_);
    assert(ny_ > 2 * borderSize_);

    gx_ = nx_ + 2 * borderSize_;
    gy_ = ny_ + 2 * borderSize_;
   
    if (debug) {
        printf("%d: (%d, %d) (%d, %d) top: %d bot: %d\n",
               ourRank_, nx_, ny_, gx_, gy_, procTop_, procBot_);
    }

    grid_.resize(2 * gx_ * gy_);
    
    prev.setMembers(&grid_, gx_, 0);
    curr.setMembers(&grid_, gx_, gx_ * gy_);

    initializeIC(params, ourRank_, totalNumProcessors);
    //create the copy of the grid we need for ping-ponging
    std::copy(grid_.begin(), grid_.begin() + gx_ * gy_, grid_.begin() + gx_ * gy_);
}
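All five examples wrap MPI calls in an MPI_SAFE_CALL macro whose definition is not shown in any of the snippets. Below is a minimal sketch of such a wrapper, assuming it simply aborts on any return code other than MPI_SUCCESS (the name err_ and the message format are assumptions, not the macro these projects actually use):

#include <mpi.h>
#include <stdio.h>

// Hypothetical definition: evaluate the expression and abort with a
// readable message unless it yields MPI_SUCCESS. Because any int
// expression is accepted, checking a stored error code (as done with
// statuses[i].MPI_ERROR in Example #2 below) also works.
#define MPI_SAFE_CALL(call)                                 \
    do {                                                    \
        int err_ = (call);                                  \
        if (err_ != MPI_SUCCESS) {                          \
            char msg_[MPI_MAX_ERROR_STRING];                \
            int len_ = 0;                                   \
            MPI_Error_string(err_, msg_, &len_);            \
            fprintf(stderr, "MPI error at %s:%d: %s\n",     \
                    __FILE__, __LINE__, msg_);              \
            MPI_Abort(MPI_COMM_WORLD, err_);                \
        }                                                   \
    } while (0)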
Example #2
int main(int argc, char* argv[])
{
	// Parse test command line arguments, perform early
	// initializations.
	const char *name, *mode;
	int n, nt, sx, sy, ss, rank, szcomm;
#ifdef CUDA
	struct cudaDeviceProp props;
#endif
	test_parse(argc, argv, &name, &mode,
		&n, &nt, &sx, &sy, &ss, &rank, &szcomm
#ifdef CUDA
		, &props
#endif
		);

#ifdef CUDA
	int cpu = !strcmp(mode, "CPU");
	int gpu = !strcmp(mode, "GPU");
#else
	int cpu = 1;
	int gpu = 0;
#endif

	// Create test configuration.
	struct test_config_t* t = test_init(
		name, mode, n, nt, sx, sy, ss, rank, szcomm,
		xmin, ymin, zmin, xmax, ymax, zmax,
		bx, by, bs, ex, ey, es
#ifdef CUDA
		, &props
#endif
		);
	
	// Create another test configuration to check correctness.
	struct test_config_t* t_check = NULL;
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		t_check = test_init(
			name, mode, n, nt, 1, 1, 1, 0, 1,
			xmin, ymin, zmin, xmax, ymax, zmax,
			bx, by, bs, ex, ey, es
#ifdef CUDA
			, &props
#endif
			);
	}
	
	// Generate the initial data distribution and load it
	// onto compute nodes.
	integer* array = (integer*)malloc(t->cpu.parent->grid->extsize * sizeof(integer));
	genirand(t->cpu.parent->grid->extsize, array);
	test_load(t, n, sx, sy, ss, sizeof(integer), (char*)array);
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		size_t nxysb = n * n * n * sizeof(integer);
	
		// Copy the data array.
		memcpy(t_check->cpu.arrays[0], array, nxysb);
		
		// Duplicate initial distribution to the second level array.
		memcpy(t_check->cpu.arrays[1], t_check->cpu.arrays[0], nxysb);
	}
	free(array);
#ifdef VERBOSE
	printf("step 0\n");
	printf("step 1\n");
#endif
	// The time iterations loop, CPU and GPU versions.
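	// Time levels 0 and 1 hold the initial data duplicated above,
	// so stepping starts at it = 2.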
	for (int it = 2; it < t->nt; it++)
	{
		// Run one iteration of the stencil, measuring its time.
		// In case of MPI, the time of iteration is measured together
		// with the time of data sync.
		struct timespec start, stop;
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			stenfw_get_time(&start);
		}
#ifdef MPI
		struct grid_domain_t* subdomains = t->cpu.subdomains;

		int nsubdomains = t->cpu.nsubdomains;

		// Copy the current iteration data into boundary slices
		// and compute stencil in them.
		// Boundary slices themselves are subdomains with respect
		// to each MPI decomposition domain.
		{
			// Set subdomain data copying callbacks:
			// use simple memcpy in this case.
			for (int i = 0; i < nsubdomains; i++)
			{
				struct grid_domain_t* sub = subdomains + i;
				sub->scatter_memcpy = &grid_subcpy;
				sub->gather_memcpy = &grid_subcpy;
			}

			// Scatter domain edges for separate computation.
			grid_scatter(subdomains, &t->cpu, 0, LAYOUT_MODE_CUSTOM);
			
			// Process the edge subdomains.
			for (int i = 0; i < nsubdomains; i++)
			{
				struct grid_domain_t* sub = subdomains + i;

				int nx = sub->grid[0].bx + sub->grid[0].nx + sub->grid[0].ex;
				int ny = sub->grid[0].by + sub->grid[0].ny + sub->grid[0].ey;
				int ns = sub->grid[0].bs + sub->grid[0].ns + sub->grid[0].es;

				isum13pt_cpu(nx, ny, ns,
					(integer(*)[ny][nx])sub->arrays[0],
					(integer(*)[ny][nx])sub->arrays[1],
					(integer(*)[ny][nx])sub->arrays[2]);
			}
		}
		
		// Start sharing boundary slices between linked subdomains.
		MPI_Request* reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * 2 * nsubdomains);
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
			struct grid_domain_t* neighbor = *(subdomain->links.dense[0]);

			assert(neighbor->grid[1].extsize == subdomain->grid[0].extsize);
		
			int szelem = sizeof(integer);

			size_t dnx = neighbor->grid[1].nx * szelem;			
			size_t dny = neighbor->grid[1].ny;
			size_t dns = neighbor->grid[1].ns;

			size_t snx = subdomain->grid[0].nx * szelem;
			size_t sbx = subdomain->grid[0].bx * szelem;
			size_t sex = subdomain->grid[0].ex * szelem;
			
			size_t sny = subdomain->grid[0].ny, sns = subdomain->grid[0].ns;
			size_t sby = subdomain->grid[0].by, sbs = subdomain->grid[0].bs;
			size_t sey = subdomain->grid[0].ey, ses = subdomain->grid[0].es;

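			// Linearized byte offset of the subdomain's first interior
			// point: skip sbs full XY planes, then sby rows, then sbx bytes.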
			size_t soffset = sbx + (sbx + snx + sex) *
				(sby + sbs * (sby + sny + sey));

			struct grid_domain_t obuf;
			memset(&obuf, 0, sizeof(struct grid_domain_t));
			obuf.arrays = subdomain->arrays + 1;
			obuf.narrays = 1;
			obuf.offset = 0;
			obuf.grid[0].nx = dnx;
			obuf.grid[0].ny = dny;
			obuf.grid[0].ns = dns;
			obuf.grid->size = dnx * dny * dns;
		
			struct grid_domain_t scpy = *subdomain;
			scpy.arrays = subdomain->arrays + 2;
			scpy.narrays = 1;
			scpy.offset = soffset;
			scpy.grid[0].nx = sbx + snx + sex;
			scpy.grid[0].ny = sby + sny + sey;
			scpy.grid[0].ns = sbs + sns + ses;
			
			// Copy data to the temporary buffer.
			grid_subcpy(dnx, dny, dns, &obuf, &scpy);

			// Exchange temporary buffers with the subdomain neighbor.
			int subdomain_rank = grid_rank1d(subdomain->parent->parent, subdomain->parent->grid);
			int neighbor_rank = grid_rank1d(neighbor->parent->parent, neighbor->parent->grid);
			MPI_SAFE_CALL(MPI_Isend(subdomain->arrays[1], obuf.grid->size,
				MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i]));
			MPI_SAFE_CALL(MPI_Irecv(subdomain->arrays[0], obuf.grid->size,
				MPI_BYTE, neighbor_rank, 0, MPI_COMM_WORLD, &reqs[2 * i + 1]));
#ifdef VERBOSE
			printf("sharing: send %d->%d\n", subdomain_rank, neighbor_rank);
			printf("sharing: recv %d->%d\n", neighbor_rank, subdomain_rank);
#endif
		}
#endif // MPI
		// Compute inner grid points of the subdomain.
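		// With MPI enabled, this computation overlaps the non-blocking
		// boundary exchange started above, hiding communication latency.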
		int nx = t->cpu.grid->bx + t->cpu.grid->nx + t->cpu.grid->ex;
		int ny = t->cpu.grid->by + t->cpu.grid->ny + t->cpu.grid->ey;
		int ns = t->cpu.grid->bs + t->cpu.grid->ns + t->cpu.grid->es;

		if (cpu)
		{
			isum13pt_cpu(nx, ny, ns,
				(integer(*)[ny][nx])t->cpu.arrays[0],
				(integer(*)[ny][nx])t->cpu.arrays[1],
				(integer(*)[ny][nx])t->cpu.arrays[2]);	
		}
#ifdef CUDA
		if (gpu)
		{
			isum13pt_gpu(nx, ny, ns,
				(integer*)t->gpu.arrays[0],
				(integer*)t->gpu.arrays[1],
				(integer*)t->gpu.arrays[2]);
#ifdef VISUALIZE
#ifndef CUDA_MAPPED
			// If GPU is not using mapped host memory, then need to fetch
			// the current iteration solution explicitly.
			// TODO: in case of MPI/CUDA/!MAPPED this copy must go AFTER
			// boundaries gathering.
			CUDA_SAFE_CALL(cudaMemcpy(t->cpu.arrays[2], t->gpu.arrays[2],
				t->gpu.grid->extsize * sizeof(real), cudaMemcpyDeviceToHost));
#endif // CUDA_MAPPED
#endif
		}
#endif // CUDA
#ifdef MPI
		// Wait for the boundary sharing to complete.
		MPI_Status* statuses = (MPI_Status*)malloc(2 * nsubdomains * sizeof(MPI_Status));
		MPI_SAFE_CALL(MPI_Waitall(2 * nsubdomains, reqs, statuses));
		for (int i = 0; i < 2 * nsubdomains; i++)
			MPI_SAFE_CALL(statuses[i].MPI_ERROR);
		free(statuses);
		free(reqs);
		for (int i = 0; i < nsubdomains; i++)
		{
			struct grid_domain_t* subdomain = subdomains + i;
			
			int szelem = sizeof(integer);

			size_t dnx = subdomain->grid[1].nx * szelem;
			size_t dbx = subdomain->grid[1].bx * szelem;
			size_t dex = subdomain->grid[1].ex * szelem;
			
			size_t dny = subdomain->grid[1].ny, dns = subdomain->grid[1].ns;
			size_t dby = subdomain->grid[1].by, dbs = subdomain->grid[1].bs;
			size_t dey = subdomain->grid[1].ey, des = subdomain->grid[1].es;

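			// Same linearized-offset computation as on the send side,
			// now using the level-1 (receive) grid dimensions.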
			size_t doffset = dbx + (dbx + dnx + dex) *
				(dby + dbs * (dby + dny + dey));

			struct grid_domain_t dcpy = *subdomain;
			dcpy.arrays = subdomain->arrays + 2;
			dcpy.narrays = 1;
			dcpy.offset = doffset;
			dcpy.grid[0].nx = dbx + dnx + dex;
			dcpy.grid[0].ny = dby + dny + dey;
			dcpy.grid[0].ns = dbs + dns + des;

			struct grid_domain_t ibuf;
			memset(&ibuf, 0, sizeof(struct grid_domain_t));
			ibuf.arrays = subdomain->arrays;
			ibuf.narrays = 1;
			ibuf.offset = 0;
			ibuf.grid[0].nx = dnx;
			ibuf.grid[0].ny = dny;
			ibuf.grid[0].ns = dns;
		
			// Copy the received data from the temporary buffer back into the subdomain.
			grid_subcpy(dnx, dny, dns, &dcpy, &ibuf);

			// Swap pointers so that the latest iteration ends up in the bottom array.
			char* w = subdomain->arrays[0];
			subdomain->arrays[0] = subdomain->arrays[2];
			subdomain->arrays[2] = w;
		}

		// Gather the boundaries for the next time step: insert the
		// separately computed boundaries back into the subdomains.
		struct grid_domain_t target = t->cpu;
		target.narrays = 1;
		target.arrays = t->cpu.arrays + 2;
		grid_gather(&target, subdomains, 1, LAYOUT_MODE_CUSTOM);
		
		if (t->rank != MPI_ROOT_NODE)
		{
#ifdef VERBOSE
			printf("step %d\n", it);
#endif
		}
		else
#endif // MPI		
		{
			stenfw_get_time(&stop);
			printf("step %d time = ", it);
			stenfw_print_time_diff(start, stop);
			printf(" sec\n");
		}
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			// Compute inner grid points of the control solution subdomain.
			int nx = t_check->cpu.grid->bx + t_check->cpu.grid->nx + t_check->cpu.grid->ex;
			int ny = t_check->cpu.grid->by + t_check->cpu.grid->ny + t_check->cpu.grid->ey;
			int ns = t_check->cpu.grid->bs + t_check->cpu.grid->ns + t_check->cpu.grid->es;

			isum13pt_cpu(nx, ny, ns,
				(integer(*)[ny][nx])t_check->cpu.arrays[0],
				(integer(*)[ny][nx])t_check->cpu.arrays[1],
				(integer(*)[ny][nx])t_check->cpu.arrays[2]);
		}

		// Print the maximum absolute difference between the solution
		// and the control solution.
		test_write_imaxabsdiff(t, t_check, 2, it);

		// Swap pointers to rewrite the oldest iteration with
		// the next one.
		char* w = t->cpu.arrays[0];
		t->cpu.arrays[0] = t->cpu.arrays[1];
		t->cpu.arrays[1] = t->cpu.arrays[2];
		t->cpu.arrays[2] = w;
#ifdef CUDA
		if (gpu)
		{
			// Also swap the corresponding GPU arrays pointers.
			w = t->gpu.arrays[0];
			t->gpu.arrays[0] = t->gpu.arrays[1];
			t->gpu.arrays[1] = t->gpu.arrays[2];
			t->gpu.arrays[2] = w;
		}
#endif
#ifdef MPI
		if (t->rank == MPI_ROOT_NODE)
#endif
		{
			// Swap pointers to rewrite the oldest control solution
			// iteration with the next one.
			char* w = t_check->cpu.arrays[0];
			t_check->cpu.arrays[0] = t_check->cpu.arrays[1];
			t_check->cpu.arrays[1] = t_check->cpu.arrays[2];
			t_check->cpu.arrays[2] = w;
		}
	}

	// Dispose the test configurations.
#ifdef MPI
	if (t->rank == MPI_ROOT_NODE)
#endif
	{
		test_dispose(t_check);
	}
	test_dispose(t);

	return 0;
}
Example #3
	// Non-blocking all-gather of unsigned shorts. Note that MPI count
	// arguments are int, so the size_t parameters are narrowed here; the
	// caller must complete req (e.g. with MPI_Wait) before reading rbuf.
	static inline void gather(void* sbuf, size_t sz_s, void* rbuf, size_t sz_r, MPI_Request& req)
	{
		MPI_SAFE_CALL(MPI_Iallgather(sbuf, sz_s, MPI_UNSIGNED_SHORT, rbuf, sz_r, MPI_UNSIGNED_SHORT, MPI_COMM_WORLD, &req));
	}
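A hedged usage sketch for this wrapper (the enclosing class is not shown in the source, so calling gather unqualified and the buffer sizing are assumptions):

#include <mpi.h>
#include <vector>

// Hypothetical caller: every rank contributes `count` unsigned shorts.
// MPI_Iallgather is non-blocking, so the request must complete before
// the receive buffer is read.
void example_allgather(int count, int nranks)
{
	std::vector<unsigned short> send(count), recv(count * nranks);
	MPI_Request req;
	gather(send.data(), count, recv.data(), count, req);
	MPI_SAFE_CALL(MPI_Wait(&req, MPI_STATUS_IGNORE));
	// recv now holds all ranks' contributions, ordered by rank.
}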
Example #4
// Finalize MPI only if this Linkers instance was the one that initialized it.
Linkers::~Linkers() {
  if (is_init_) {
    MPI_SAFE_CALL(MPI_Finalize());
  }
}
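The is_init_ flag implies a matching initialization path elsewhere in the class. A minimal sketch of what that counterpart could look like (this constructor and its signature are assumptions, not taken from the source):

#include <mpi.h>

// Hypothetical counterpart: initialize MPI once and record that this
// object is responsible for finalizing it later.
Linkers::Linkers(int* argc, char*** argv) {
  int already_initialized = 0;
  MPI_SAFE_CALL(MPI_Initialized(&already_initialized));
  is_init_ = !already_initialized;
  if (is_init_) {
    MPI_SAFE_CALL(MPI_Init(argc, argv));
  }
}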
Example #5
	// Byte-based variant of Example #3: element counts are scaled by
	// sizeof(T), so any trivially copyable T can be gathered as MPI_BYTE.
	static inline void gather(void* sbuf, size_t sz_s, void* rbuf, size_t sz_r, MPI_Request& req)
	{
		MPI_SAFE_CALL(MPI_Iallgather(sbuf, sizeof(T) * sz_s, MPI_BYTE, rbuf, sz_r * sizeof(T), MPI_BYTE, MPI_COMM_WORLD, &req));
	}
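Apart from the byte scaling, usage is identical to the sketch after Example #3: start the collective, then complete the request with MPI_Wait before touching the receive buffer. The same narrowing caveat applies: MPI count arguments are int, so the size_t counts multiplied by sizeof(T) can overflow for very large buffers.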