/* Apply gravity: keep shifting the free blocks down one step at a time
 * until move_blocks() reports that nothing could move any further. */
void drop_blocks(int grid[GRID_ROWS][GRID_COLS], free_blocks *blocks)
{
    for (;;) {
        if (!move_blocks(grid, blocks, DOWN))
            break;
    }
}
/*
 * Shift every block in the list by (h_change, v_change), wrapping each
 * coordinate back into [0, h_wrap) and [0, v_wrap).
 *
 * Wrapping is done with a single add/subtract, exactly as the original
 * did, so it assumes |h_change| < h_wrap and |v_change| < v_wrap
 * — TODO confirm callers never pass larger deltas.
 *
 * Fixes: the original recursed once per list node (stack-overflow risk
 * on long lists) and re-evaluated the ((BLOCK *)(blocks->data)) cast six
 * times per node; this walks the list iteratively and caches the pointer.
 * Node processing order (head to tail) and all updates are unchanged.
 */
void move_blocks(BLOCK_LIST *blocks, int h_change, int v_change, int h_wrap, int v_wrap) {
    for (; blocks != NULL; blocks = blocks->next) {
        BLOCK *b = (BLOCK *)(blocks->data);

        b->hpos += h_change;
        if (b->hpos >= h_wrap)
            b->hpos -= h_wrap;
        else if (b->hpos < 0)
            b->hpos += h_wrap;

        b->vpos += v_change;
        if (b->vpos >= v_wrap)
            b->vpos -= v_wrap;
        else if (b->vpos < 0)
            b->vpos += v_wrap;
    }
}
// This file includes routines needed for load balancing.  Load balancing is
// based on RCB (recursive coordinate bisection).  At each stage, a direction
// and factor is chosen (factor is based on the prime factorization of the
// number of processors) and the blocks in that group are sorted in that
// direction and divided into factor subgroups.  Then dots (lightweight
// records corresponding to blocks) are moved into the proper subgroup and
// the process is repeated with the subgroups until each group represents a
// processor.  The dots are then moved back to the originating processor, at
// which point we know where the blocks need to be moved and then the blocks
// are moved.  Some of these routines are also used when blocks need to be
// coarsened - the coarsening routine determines which blocks need to be
// coarsened and those blocks are moved to the processor where their parent
// is.

// Rebalance the active blocks across all MPI ranks via RCB over their
// center coordinates.  Reads/writes the module globals (blocks, dots,
// sorted_list, num_dots, npx/npy/npz, to/from, timer accumulators) and
// performs collective MPI communication, so every rank must call it.
void load_balance(void) {
   int npx1, npy1, npz1, nfac, fac[25], fact;
   int i, j, m, n, dir, in;
   double t1, t2, t3, t4, t5, tp, tm, tu;   // timer deltas: pack/move/unpack etc.
   block *bp;

   // tp/tm/tu are filled in by move_blocks(); zero them in case that
   // call is skipped (nothing to move).
   tp = tm = tu = 0.0;
   t3 = t4 = t5 = 0.0;

   t1 = timer();
   // Build one "dot" per active block (number >= 0), walking blocks in
   // sorted order.  Each dot carries the block's center, global number,
   // local index, and owning rank; new_proc is decided by the RCB below.
   for (in = 0, num_dots = 0; in < sorted_index[num_refine+1]; in++) {
      n = sorted_list[in].n;
      if ((bp = &blocks[n])->number >= 0) {
         bp->new_proc = my_pe;
         if ((num_dots+1) > max_num_dots) {
            printf("%d ERROR: need more dots\n", my_pe);
            exit(-1);
         }
         dots[num_dots].cen[0] = bp->cen[0];
         dots[num_dots].cen[1] = bp->cen[1];
         dots[num_dots].cen[2] = bp->cen[2];
         dots[num_dots].number = bp->number;
         dots[num_dots].n = n;
         dots[num_dots].proc = my_pe;
         dots[num_dots++].new_proc = 0;   // note: increments num_dots last
      }
   }
   max_active_dot = num_dots;
   // Mark the unused tail of the dot array as inactive.
   for (n = num_dots; n < max_num_dots; n++)
      dots[n].number = -1;

   // RCB: peel off prime factors of num_pes (largest first).  At each
   // stage pick a cut direction, sort the dots in that direction, and
   // split the current groups into 'fact' subgroups.
   npx1 = npx;
   npy1 = npy;
   npz1 = npz;
   nfac = factor(num_pes, fac);
   for (i = nfac, j = 0; i > 0; i--, j++) {
      fact = fac[i-1];
      dir = find_dir(fact, npx1, npy1, npz1);
      if (dir == 0)
         npx1 /= fact;
      else if (dir == 1)
         npy1 /= fact;
      else
         npz1 /= fact;
      sort(j, fact, dir);
      move_dots(j, fact);
   }

   // first have to move information from dots back to original core,
   // then will update processor block is moving to, and then its neighbors

   // Count how many dots each remote rank must receive back (to[]), and
   // how many of this rank's dots ended up elsewhere (m).
   for (n = 0; n < num_pes; n++)
      to[n] = 0;
   for (m = i = 0; i < max_active_dot; i++)
      if (dots[i].number >= 0 && dots[i].proc != my_pe) {
         to[dots[i].proc]++;
         m++;
      }
   num_moved_lb += m;
   // Global count of displaced dots; if zero everywhere, skip all movement.
   // NOTE(review): MPI_INTEGER is the Fortran integer type; for C int
   // buffers MPI_INT is the portable choice — works here only where the
   // two types coincide.  Confirm against the rest of the codebase.
   MPI_Allreduce(&m, &n, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD);
   t4 = timer();
   t2 = t4 - t1;   // t2 = dot-creation + sort/partition time
   if (n) {  // Only move dots and blocks if there is something to move
      // Exchange per-rank send counts, return dots to their home ranks,
      // then migrate the actual block data (timing reported via tp/tm/tu).
      MPI_Alltoall(to, 1, MPI_INTEGER, from, 1, MPI_INTEGER, MPI_COMM_WORLD);
      move_dots_back();
      t5 = timer();
      t3 = t5 - t4;   // t3 = dot move-back time
      t4 = t5;
      move_blocks(&tp, &tm, &tu);
   }
   t5 = timer() - t4;   // t5 = block-migration wall time minus pack/move/unpack
   // Fold this call's deltas into the global load-balance timer buckets;
   // "misc" is whatever total time is not attributed to a specific phase.
   timer_lb_misc += timer() - t1 - t2 - t3 - tp - tm - tu;
   timer_lb_sort += t2;
   timer_lb_pa += tp;
   timer_lb_mv += tm;
   timer_lb_un += tu;
   timer_lb_mb += t3;
   timer_lb_ma += t5;
}