// This routine sets up the IloCplex algorithm to solve the worker LP,
// and creates the worker LP (i.e., the dual of flow constraints and
// capacity constraints of the flow MILP)
//
// Modeling variables:
// forall k in V0, i in V:
//    u(k,i) = dual variable associated with flow constraint (k,i)
//
// forall k in V0, forall (i,j) in A:
//    v(k,i,j) = dual variable associated with capacity constraint (k,i,j)
//
// Objective:
// minimize sum(k in V0) sum((i,j) in A) x(i,j) * v(k,i,j)
//          - sum(k in V0) u(k,0) + sum(k in V0) u(k,k)
//
// Constraints:
// forall k in V0, forall (i,j) in A: u(k,i) - u(k,j) <= v(k,i,j)
//
// Nonnegativity on variables v(k,i,j)
// forall k in V0, forall (i,j) in A: v(k,i,j) >= 0
//
void
createWorkerLP(IloCplex cplex, IloNumVarArray v, IloNumVarArray u,
               IloObjective obj, IloInt numNodes)
{
   IloInt i, j, k;
   IloEnv env = cplex.getEnv();
   IloModel mod(env, "atsp_worker");

   // Set up the IloCplex algorithm to solve the worker LP
   cplex.extract(mod);
   cplex.setOut(env.getNullStream());

   // Turn off the presolve reductions and set the CPLEX optimizer
   // to solve the worker LP with the primal simplex method.
   cplex.setParam(IloCplex::Reduce, 0);
   cplex.setParam(IloCplex::RootAlg, IloCplex::Primal);

   // Create variables v(k,i,j) forall k in V0, (i,j) in A.
   // For simplicity, dummy variables v(k,i,i) are also created.
   // Those variables are fixed to 0 and do not participate in
   // the constraints.
   IloInt numArcs  = numNodes * numNodes;
   IloInt vNumVars = (numNodes-1) * numArcs;
   IloNumVarArray vTemp(env, vNumVars, 0, IloInfinity);
   for (k = 1; k < numNodes; ++k) {
      for (i = 0; i < numNodes; ++i) {
         vTemp[(k-1)*numArcs + i*numNodes + i].setBounds(0, 0);
      }
   }
   v.clear();
   v.add(vTemp);
   vTemp.end();
   mod.add(v);

   // Set names for variables v(k,i,j)
   for (k = 1; k < numNodes; ++k) {
      for (i = 0; i < numNodes; ++i) {
         for (j = 0; j < numNodes; ++j) {
            char varName[100];
            sprintf(varName, "v.%d.%d.%d", (int) k, (int) i, (int) j);
            v[(k-1)*numArcs + i*numNodes + j].setName(varName);
         }
      }
   }

   // Associate indices to variables v(k,i,j)
   IloIntArray vIndex(env, vNumVars);
   for (j = 0; j < vNumVars; ++j) {
      vIndex[j] = j;
      v[j].setObject(&vIndex[j]);
   }

   // Create variables u(k,i) forall k in V0, i in V
   IloInt uNumVars = (numNodes-1) * numNodes;
   IloNumVarArray uTemp(env, uNumVars, -IloInfinity, IloInfinity);
   u.clear();
   u.add(uTemp);
   uTemp.end();
   mod.add(u);

   // Set names for variables u(k,i)
   for (k = 1; k < numNodes; ++k) {
      for (i = 0; i < numNodes; ++i) {
         char varName[100];
         sprintf(varName, "u.%d.%d", (int) k, (int) i);
         u[(k-1)*numNodes + i].setName(varName);
      }
   }

   // Associate indices to variables u(k,i)
   IloIntArray uIndex(env, uNumVars);
   for (j = 0; j < uNumVars; ++j) {
      uIndex[j] = vNumVars + j;
      u[j].setObject(&uIndex[j]);
   }

   // The initial objective function is empty
   obj.setSense(IloObjective::Minimize);
   mod.add(obj);

   // Add constraints:
   // forall k in V0, forall (i,j) in A: u(k,i) - u(k,j) <= v(k,i,j)
   for (k = 1; k < numNodes; ++k) {
      for (i = 0; i < numNodes; ++i) {
         for (j = 0; j < numNodes; ++j) {
            if ( i != j ) {
               IloExpr expr(env);
               expr -= v[(k-1)*numArcs + i*numNodes + j];
               expr += u[(k-1)*numNodes + i];
               expr -= u[(k-1)*numNodes + j];
               mod.add(expr <= 0);
               expr.end();
            }
         }
      }
   }

} // END createWorkerLP
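// A minimal usage sketch (not part of the original example) of how the worker
// LP above fits into a Benders separation step: given a candidate master
// solution, rebuild the objective
//    min sum(k) sum((i,j)) x(i,j)*v(k,i,j) - sum(k) u(k,0) + sum(k) u(k,k)
// using the index layout established in createWorkerLP, and re-solve. The
// function name and the xSol parameter (an assumed flattened array with
// xSol[i*numNodes + j] = x(i,j)) are illustrative assumptions.
void
solveWorkerLP(IloCplex cplex, IloNumVarArray v, IloNumVarArray u,
              IloObjective obj, IloNumArray xSol, IloInt numNodes)
{
   IloEnv env = cplex.getEnv();
   IloInt numArcs = numNodes * numNodes;
   IloExpr objExpr(env);
   for (IloInt k = 1; k < numNodes; ++k) {
      for (IloInt i = 0; i < numNodes; ++i) {
         for (IloInt j = 0; j < numNodes; ++j) {
            // x(i,j) * v(k,i,j)
            objExpr += xSol[i*numNodes + j] * v[(k-1)*numArcs + i*numNodes + j];
         }
      }
      objExpr -= u[(k-1)*numNodes];      // - u(k,0)
      objExpr += u[(k-1)*numNodes + k];  // + u(k,k)
   }
   obj.setExpr(objExpr);  // overwrite the (initially empty) objective
   objExpr.end();
   cplex.solve();  // if the dual is unbounded, an unbounded ray yields a cut
} // END solveWorkerLP (illustrative sketch)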
void GrayScott::step()
{
    // update step
    if (world.rank == 0) {
        ++currStep_;
    }

    MPI_Request request[8];
    MPI_Status status[8];

    // exchange boundaries along y-direction
    if (world.coord_x % 2 == 0) {
        // first send top, then bottom
        MPI_Isend(&u_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[0]);
        MPI_Irecv(&u_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[1]);
        MPI_Isend(&u_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[2]);
        MPI_Irecv(&u_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[3]);
        MPI_Isend(&v_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[4]);
        MPI_Irecv(&v_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[5]);
        MPI_Isend(&v_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[6]);
        MPI_Irecv(&v_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[7]);
    }
    else {
        // first send bottom, then top
        MPI_Irecv(&u_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[0]);
        MPI_Isend(&u_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[1]);
        MPI_Irecv(&u_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[2]);
        MPI_Isend(&u_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[3]);
        MPI_Irecv(&v_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[4]);
        MPI_Isend(&v_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[5]);
        MPI_Irecv(&v_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[6]);
        MPI_Isend(&v_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[7]);
    }

    // u and v at the half step
    std::vector<double> uTemp(u_.size());
    std::vector<double> vTemp(v_.size());

    // right-hand sides for u and for v
    std::vector<double> uRhs(N_);
    std::vector<double> vRhs(N_);

    /****************** DIFFUSION (ADI) ***************************************/

    // perform the first half-step: loop over all rows;
    // parallelize the outer loop (y-direction) with MPI,
    // the inner loop with OpenMP (TODO)

    // inner grid points
    #pragma omp parallel num_threads(nthreads_)
    {
        std::vector<double> puRhs(N_);
        std::vector<double> pvRhs(N_);

        #pragma omp for
        for (int i=1; i<Nx_loc-1; ++i) {
            // create the right-hand sides of the systems
            for (int j=0; j<N_; ++j) {
                puRhs[j] = U(i,j) + uCoeff * (U(i+1,j) - 2.*U(i,j) + U(i-1,j));
                pvRhs[j] = V(i,j) + vCoeff * (V(i+1,j) - 2.*V(i,j) + V(i-1,j));
            }

            TriDiagMatrixSolver::solve(N_, matU1_, puRhs, &UTEMP(i,0), 1);
            TriDiagMatrixSolver::solve(N_, matV1_, pvRhs, &VTEMP(i,0), 1);
        }
    } // omp parallel

    // wait for boundaries to arrive
    MPI_Waitall(8, request, status);

    // update local boundaries
    if (world.rank == 0) {
        // i=0 local and global
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(0,j) + uCoeff * (U(1,j) - U(0,j));
            vRhs[j] = V(0,j) + vCoeff * (V(1,j) - V(0,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(0,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(0,0), 1);
    }
    else {
        // i=0 local
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(0,j) + uCoeff * (U(0+1,j) - 2.*U(0,j) + U(0-1,j));
            vRhs[j] = V(0,j) + vCoeff * (V(0+1,j) - 2.*V(0,j) + V(0-1,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(0,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(0,0), 1);
    }

    if (world.rank == world.size-1) {
        // i=Nx_loc-1 local and i=N_-1 global
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(Nx_loc-1,j) + uCoeff * (- U(Nx_loc-1,j) + U(Nx_loc-2,j));
            vRhs[j] = V(Nx_loc-1,j) + vCoeff * (- V(Nx_loc-1,j) + V(Nx_loc-2,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(Nx_loc-1,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(Nx_loc-1,0), 1);
    }
    else {
        // i=Nx_loc-1 local
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(Nx_loc-1,j) + uCoeff * (U(Nx_loc-1+1,j) - 2.*U(Nx_loc-1,j) + U(Nx_loc-1-1,j));
            vRhs[j] = V(Nx_loc-1,j) + vCoeff * (V(Nx_loc-1+1,j) - 2.*V(Nx_loc-1,j) + V(Nx_loc-1-1,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(Nx_loc-1,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(Nx_loc-1,0), 1);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    // transpose matrix
    // TODO: either use the send-datatype also for the receive and transpose
    // locally, or use a recv-datatype -> test which is faster

    // transpose global blocks (send from uTemp to u_);
    // start at Ny_loc, because we ignore the ghost cells
    if (localtranspose_) {
        MPI_Alltoall(&uTemp[Ny_loc], 1, block_resized_send, &u_[Ny_loc], 1, block_resized_send, MPI_COMM_WORLD);
        MPI_Alltoall(&vTemp[Ny_loc], 1, block_resized_send, &v_[Ny_loc], 1, block_resized_send, MPI_COMM_WORLD);

        // locally transpose blocks
        #pragma omp parallel num_threads(nthreads_)
        {
            int ind1, ind2;
            #pragma omp for
            for (int b=0; b<Nb_loc; ++b) {
                for (int i=0; i<Nx_loc; ++i) {
                    for (int j=0; j<i; ++j) {
                        ind1 = (i+1)*Ny_loc + j + b*Nx_loc; // regular index + offset of block
                        ind2 = (j+1)*Ny_loc + i + b*Nx_loc; // switch i and j
                        std::swap(u_[ind1], u_[ind2]);
                        std::swap(v_[ind1], v_[ind2]);
                    }
                }
            }
        } // omp parallel
    }
    else {
        MPI_Alltoall(&uTemp[Ny_loc], 1, block_resized_send, &u_[Ny_loc], 1, block_resized_recv, MPI_COMM_WORLD);
        MPI_Alltoall(&vTemp[Ny_loc], 1, block_resized_send, &v_[Ny_loc], 1, block_resized_recv, MPI_COMM_WORLD);
    }

    // exchange new boundaries
    if (world.coord_x % 2 == 0) {
        // first send top, then bottom
        MPI_Isend(&u_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[0]);
        MPI_Irecv(&u_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[1]);
        MPI_Isend(&u_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[2]);
        MPI_Irecv(&u_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[3]);
        MPI_Isend(&v_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[4]);
        MPI_Irecv(&v_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[5]);
        MPI_Isend(&v_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[6]);
        MPI_Irecv(&v_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[7]);
    }
    else {
        // first send bottom, then top
        MPI_Irecv(&u_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[0]);
        MPI_Isend(&u_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[1]);
        MPI_Irecv(&u_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[2]);
        MPI_Isend(&u_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[3]);
        MPI_Irecv(&v_[0],                   1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[4]);
        MPI_Isend(&v_[(Ny_loc)],            1, top_boundary,    world.top_proc,    TAG, cart_comm, &request[5]);
        MPI_Irecv(&v_[(Nx_loc+1)*(Ny_loc)], 1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[6]);
        MPI_Isend(&v_[(Nx_loc)*(Ny_loc)],   1, bottom_boundary, world.bottom_proc, TAG, cart_comm, &request[7]);
    }

    // perform the second half-step: inner grid points
    #pragma omp parallel num_threads(nthreads_)
    {
        std::vector<double> puRhs(N_);
        std::vector<double> pvRhs(N_);

        #pragma omp for
        for (int i=1; i<Nx_loc-1; ++i) {
            // create the right-hand sides of the systems
            for (int j=0; j<N_; ++j) {
                puRhs[j] = U(i,j) + uCoeff * (U(i+1,j) - 2.*U(i,j) + U(i-1,j));
                pvRhs[j] = V(i,j) + vCoeff * (V(i+1,j) - 2.*V(i,j) + V(i-1,j));
            }

            TriDiagMatrixSolver::solve(N_, matU1_, puRhs, &UTEMP(i,0), 1);
            TriDiagMatrixSolver::solve(N_, matV1_, pvRhs, &VTEMP(i,0), 1);
        }
    } // omp parallel

    // wait for boundaries to arrive
    MPI_Waitall(8, request, status);

    // update local boundaries
    // top
    if (world.rank == 0) {
        // i=0 local and global
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(0,j) + uCoeff * (U(1,j) - U(0,j));
            vRhs[j] = V(0,j) + vCoeff * (V(1,j) - V(0,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(0,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(0,0), 1);
    }
    else {
        // i=0 local, but not global
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(0,j) + uCoeff * (U(0+1,j) - 2.*U(0,j) + U(0-1,j));
            vRhs[j] = V(0,j) + vCoeff * (V(0+1,j) - 2.*V(0,j) + V(0-1,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(0,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(0,0), 1);
    }

    // bottom
    if (world.rank == world.size-1) {
        // i=Nx_loc-1 local and i=N_-1 global
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(Nx_loc-1,j) + uCoeff * (- U(Nx_loc-1,j) + U(Nx_loc-2,j));
            vRhs[j] = V(Nx_loc-1,j) + vCoeff * (- V(Nx_loc-1,j) + V(Nx_loc-2,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(Nx_loc-1,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(Nx_loc-1,0), 1);
    }
    else {
        // i=Nx_loc-1 local
        for (int j=0; j<N_; ++j) {
            uRhs[j] = U(Nx_loc-1,j) + uCoeff * (U(Nx_loc-1+1,j) - 2.*U(Nx_loc-1,j) + U(Nx_loc-1-1,j));
            vRhs[j] = V(Nx_loc-1,j) + vCoeff * (V(Nx_loc-1+1,j) - 2.*V(Nx_loc-1,j) + V(Nx_loc-1-1,j));
        }
        TriDiagMatrixSolver::solve(N_, matU1_, uRhs, &UTEMP(Nx_loc-1,0), 1);
        TriDiagMatrixSolver::solve(N_, matV1_, vRhs, &VTEMP(Nx_loc-1,0), 1);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    // transpose back
    // transpose global blocks (send from uTemp to u_);
    // start at Ny_loc, because we ignore the ghost cells
    if (localtranspose_) {
        MPI_Alltoall(&uTemp[Ny_loc], 1, block_resized_send, &u_[Ny_loc], 1, block_resized_send, MPI_COMM_WORLD);
        MPI_Alltoall(&vTemp[Ny_loc], 1, block_resized_send, &v_[Ny_loc], 1, block_resized_send, MPI_COMM_WORLD);

        // locally transpose blocks
        #pragma omp parallel num_threads(nthreads_)
        {
            int ind1, ind2;
            #pragma omp for
            for (int b=0; b<Nb_loc; ++b) {
                for (int i=0; i<Nx_loc; ++i) {
                    for (int j=0; j<i; ++j) {
                        ind1 = (i+1)*Ny_loc + j + b*Nx_loc; // regular index + offset of block
                        ind2 = (j+1)*Ny_loc + i + b*Nx_loc; // switch i and j
                        std::swap(u_[ind1], u_[ind2]);
                        std::swap(v_[ind1], v_[ind2]);
                    }
                }
            }
        } // omp parallel
    }
    else {
        MPI_Alltoall(&uTemp[Ny_loc], 1, block_resized_send, &u_[Ny_loc], 1, block_resized_recv, MPI_COMM_WORLD);
        MPI_Alltoall(&vTemp[Ny_loc], 1, block_resized_send, &v_[Ny_loc], 1, block_resized_recv, MPI_COMM_WORLD);
    }

    /****************** REACTION **********************************************/

    #pragma omp parallel num_threads(nthreads_)
    {
        double uind, vind;
        #pragma omp for collapse(2)
        for (int j=0; j<Ny_loc; ++j) {
            for (int i=0; i<Nx_loc; ++i) {
                uind = U(i,j);
                vind = V(i,j);
                U(i,j) += dt_ * ( -uind*vind*vind + F_*(1.-uind) );
                V(i,j) += dt_ * (  uind*vind*vind - (F_+k_)*vind );
            }
        }
    } // omp parallel

    MPI_Barrier(MPI_COMM_WORLD);
}
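// The TriDiagMatrixSolver used above is defined elsewhere in the project. The
// following is a minimal sketch of a solver matching the call sites
// solve(N_, matU1_, rhs, &UTEMP(i,0), 1): a standard Thomas algorithm. The
// TriDiagMatrix layout (sub-, main, and super-diagonal vectors a, b, c) is an
// assumption for illustration; the actual class may differ. The stride
// parameter lets the solver write its result into a row or column of a
// flattened 2D array.
#include <vector>

struct TriDiagMatrix {
    std::vector<double> a; // sub-diagonal   (a[0] unused)
    std::vector<double> b; // main diagonal
    std::vector<double> c; // super-diagonal (c[n-1] unused)
};

struct TriDiagMatrixSolver {
    static void solve(int n, const TriDiagMatrix& m,
                      const std::vector<double>& rhs,
                      double* result, int stride)
    {
        std::vector<double> cp(n), dp(n); // modified coefficients
        cp[0] = m.c[0] / m.b[0];
        dp[0] = rhs[0] / m.b[0];
        // forward sweep
        for (int i = 1; i < n; ++i) {
            const double denom = m.b[i] - m.a[i]*cp[i-1];
            cp[i] = m.c[i] / denom;
            dp[i] = (rhs[i] - m.a[i]*dp[i-1]) / denom;
        }
        // back substitution, writing with the given stride
        result[(n-1)*stride] = dp[n-1];
        for (int i = n-2; i >= 0; --i) {
            result[i*stride] = dp[i] - cp[i]*result[(i+1)*stride];
        }
    }
};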
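// The datatypes block_resized_send / block_resized_recv used in the
// MPI_Alltoall transpose are also created elsewhere. As an illustration only,
// a resized block datatype of this kind is typically built as sketched below.
// The block size nb and the local row length rowLen are hypothetical
// placeholders; the actual dimensions depend on the domain decomposition.
static MPI_Datatype make_block_type(int nb, int rowLen)
{
    MPI_Datatype block, block_resized;
    // nb rows of nb contiguous doubles, with a row stride of rowLen elements
    MPI_Type_vector(nb, nb, rowLen, MPI_DOUBLE, &block);
    // shrink the extent to nb doubles so that consecutive blocks passed to
    // MPI_Alltoall start nb columns apart
    MPI_Type_create_resized(block, 0, (MPI_Aint)(nb*sizeof(double)), &block_resized);
    MPI_Type_commit(&block_resized);
    MPI_Type_free(&block); // the committed resized type keeps its own reference
    return block_resized;
}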