/// Creates a distributed evaluator for the contraction of \c left and \c right.
///
/// The outer (non-contracted) dimensions of both arguments are gathered into
/// the tiled range of the result (placed according to \c perm when it is
/// non-empty), the fused row / column / inner sizes of the contraction are
/// accumulated, a process grid is built from them, and a freshly allocated
/// \c TA::detail::Summa implementation object is wrapped in a \c DistEval.
///
/// \param left  left-hand argument; its rank must equal \c op.left_rank()
/// \param right right-hand argument; its rank must equal \c op.right_rank()
/// \param world world handed to the process grid and the implementation object
/// \param shape shape forwarded to the implementation object
/// \param pmap  process map forwarded to the implementation object
/// \param perm  permutation of the result dimensions; either empty (evaluates
///              to \c false) or of dimension \c op.result_rank()
/// \param op    the contraction operation
/// \return a \c DistEval owning the new implementation object
TA::detail::DistEval<typename Op::result_type, Policy> make_contract_eval(
    const TA::detail::DistEval<LeftTile, Policy>& left,
    const TA::detail::DistEval<RightTile, Policy>& right,
    madness::World& world,
    const typename TA::detail::DistEval<typename Op::result_type, Policy>::shape_type& shape,
    const std::shared_ptr<typename TA::detail::DistEval<
        typename Op::result_type, Policy>::pmap_interface>& pmap,
    const TA::Permutation& perm, const Op& op) {
  // Argument ranks and the permutation must be consistent with the operation.
  TA_ASSERT(left.range().rank() == op.left_rank());
  TA_ASSERT(right.range().rank() == op.right_rank());
  TA_ASSERT((perm.dim() == op.result_rank()) || !perm);

  // Type of the evaluator returned to the caller and of its implementation.
  using result_eval_type =
      TA::detail::DistEval<typename Op::result_type, Policy>;
  using summa_type =
      TA::detail::Summa<TA::detail::DistEval<LeftTile, Policy>,
                        TA::detail::DistEval<RightTile, Policy>, Op, Policy>;

  // Dimension bookkeeping: the left argument is [outer..., contracted...],
  // the right argument is [contracted..., outer...].
  const unsigned int contracted_rank = op.num_contract_ranks();
  const unsigned int left_rank = op.left_rank();
  const unsigned int left_outer_rank = left_rank - contracted_rank;
  const unsigned int right_rank = op.right_rank();

  // One TiledRange1 per result dimension, filled in from the outer dimensions
  // of the left argument first and of the right argument second.
  typename summa_type::trange_type::Ranges result_ranges(op.result_rank());

  // Hands out the result slot for the next outer dimension, routed through
  // the permutation when one was supplied.
  std::size_t next_outer_dim = 0ul;
  auto take_result_slot = [&perm, &next_outer_dim]() -> std::size_t {
    const std::size_t unpermuted = next_outer_dim++;
    return perm ? perm[unpermuted] : unpermuted;
  };

  // Fused row dimension: tile count and element count over the left outer
  // dimensions.
  std::size_t row_tiles = 1ul;
  std::size_t row_elements = 1ul;
  for (unsigned int d = 0ul; d < left_outer_rank; ++d) {
    result_ranges[take_result_slot()] = left.trange().data()[d];
    row_tiles *= left.range().extent(d);
    row_elements *= left.trange().elements_range().extent(d);
  }

  // Fused column dimension: tile count and element count over the right outer
  // dimensions.
  std::size_t col_tiles = 1ul;
  std::size_t col_elements = 1ul;
  for (std::size_t d = contracted_rank; d < right_rank; ++d) {
    result_ranges[take_result_slot()] = right.trange().data()[d];
    col_tiles *= right.range().extent(d);
    col_elements *= right.trange().elements_range().extent(d);
  }

  // Number of tiles in the fused inner (contracted) dimension.
  std::size_t inner_tiles = 1ul;
  for (std::size_t d = left_outer_rank; d < left_rank; ++d) {
    inner_tiles *= left.range().extent(d);
  }

  // Tiled range of the result.
  typename summa_type::trange_type result_trange(result_ranges.begin(),
                                                 result_ranges.end());

  // Process grid for the contraction.
  TA::detail::ProcGrid grid(world, row_tiles, col_tiles, row_elements,
                            col_elements);

  return result_eval_type(std::shared_ptr<summa_type>(
      new summa_type(left, right, world, result_trange, shape, pmap, perm, op,
                     inner_tiles, grid)));
}
void tensor_contract_444(TA::DistArray<Tile, Policy>& tv, const TA::DistArray<Tile, Policy>& t, const TA::DistArray<Tile, Policy>& v) { // for convenience, obtain the tiled ranges for the two kinds of dimensions used to define t, v, and tv auto trange_occ = t.trange().dim(0); // the first dimension of t is occ auto trange_uocc = v.trange().dim(0); // every dimension of v is uocc auto ntiles_occ = trange_occ.tile_extent(); auto ntiles_uocc = trange_uocc.tile_extent(); auto n_occ = trange_occ.extent(); auto n_uocc = trange_occ.extent(); // compute the 2-d grid of processors for the SUMMA // note that the result is (occ occ|uocc uocc), hence the row dimension is occ x occ, etc. auto& world = t.world(); auto nrowtiles = ntiles_occ * ntiles_occ; auto ncoltiles = ntiles_uocc * ntiles_uocc; auto ninttiles = ntiles_uocc * ntiles_uocc; // contraction is over uocc x uocc auto nrows = n_occ * n_occ; auto ncols = n_uocc * n_uocc; TA::detail::ProcGrid proc_grid(world, nrowtiles, ncoltiles, nrows, ncols); std::shared_ptr<TA::Pmap> pmap; auto t_eval = make_array_eval(t, t.world(), TA::DenseShape(), proc_grid.make_row_phase_pmap(ninttiles), TA::Permutation(), make_array_noop<Tile>()); auto v_eval = make_array_eval(v, v.world(), TA::DenseShape(), proc_grid.make_col_phase_pmap(ninttiles), TA::Permutation(), make_array_noop<Tile>()); // // make the result metadata // // result shape TA::TiledRange trange_tv({trange_occ, trange_occ, trange_uocc, trange_uocc}); // pmap.reset(new TA::detail::BlockedPmap(world, trange_tv.tiles_range().volume())); // 'contract' object is of type // PaRSEC's PTG object will do the job here: // 1. it will use t_eval and v_eval's Futures as input // 2. there will be a dummy output ArrayEval, its Futures will be set by the PTG auto contract = make_contract_eval( t_eval, v_eval, world, TA::DenseShape(), pmap, TA::Permutation(), make_contract<Tile>(4u, 4u, 4u) ); // eval() just schedules the Summa task and proceeds // in expressions evaluation is lazy ... 
you could just use contract tiles // immediately to compose further (in principle even before eval()!) contract.eval(); // since the intent of this function is to return result as a named DistArray // migrate contract's futures to tv here // Create a temporary result array TA::DistArray<Tile,Policy> result(contract.world(), contract.trange(), contract.shape(), contract.pmap()); // Move the data from dist_eval into the result array. There is no // communication in this step. for(const auto index : *contract.pmap()) { if(! contract.is_zero(index)) result.set(index, contract.get(index)); } // uncomment this to block until Summa is complete .. but no need to wait //contract.wait(); // Swap the new array with the result array object. result.swap(tv); }
//c--------------------------------------------------------------------- //c //c Authors: S. Weeratunga //c V. Venkatakrishnan //c E. Barszcz //c M. Yarrow //c C-version: Rob Van der Wijngaart, Intel Corporation //c //c--------------------------------------------------------------------- // // Copyright 2010 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // int RCCE_APP(int argc, char **argv){ //c--------------------------------------------------------------------- //c //c driver for the performance evaluation of the solver for //c five coupled parabolic/elliptic partial differential equations. 
//c //c--------------------------------------------------------------------- char class; double mflops; int ierr, i, j, k, mm, iverified; //c--------------------------------------------------------------------- //c initialize communications //c--------------------------------------------------------------------- init_comm(&argc, &argv); // RCCE_debug_set(RCCE_DEBUG_SYNCH); //c--------------------------------------------------------------------- //c read input data //c--------------------------------------------------------------------- read_input(); //c--------------------------------------------------------------------- //c set up processor grid //c--------------------------------------------------------------------- proc_grid(); //c--------------------------------------------------------------------- //c determine the neighbors //c--------------------------------------------------------------------- neighbors(); //c--------------------------------------------------------------------- //c set up sub-domain sizes //c--------------------------------------------------------------------- subdomain(); //c--------------------------------------------------------------------- //c set up coefficients //c--------------------------------------------------------------------- setcoeff(); //c--------------------------------------------------------------------- //c set the boundary values for dependent variables //c--------------------------------------------------------------------- setbv(); //c--------------------------------------------------------------------- //c set the initial values for dependent variables //c--------------------------------------------------------------------- setiv(); //c--------------------------------------------------------------------- //c compute the forcing term based on prescribed exact solution //c--------------------------------------------------------------------- erhs(); 
////c--------------------------------------------------------------------- ////c perform one SSOR iteration to touch all data and program pages ////c--------------------------------------------------------------------- ssor(1); // ////c--------------------------------------------------------------------- ////c reset the boundary and initial values ////c--------------------------------------------------------------------- setbv(); setiv(); // ////c--------------------------------------------------------------------- ////c perform the SSOR iterations ////c--------------------------------------------------------------------- ssor(itmax); ////c--------------------------------------------------------------------- ////c compute the solution error ////c--------------------------------------------------------------------- error(); ////c--------------------------------------------------------------------- ////c compute the surface integral ////c--------------------------------------------------------------------- pintgr(); // ////c--------------------------------------------------------------------- ////c verification test ////c--------------------------------------------------------------------- if (id ==0) { verify( rsdnm, errnm, &frc, &class ); mflops = (double)(itmax)*(1984.77*(double)( nx0 ) *(double)( ny0 ) *(double)( nz0 ) -10923.3*((double)( nx0+ny0+nz0 )/3.)*((double)( nx0+ny0+nz0 )/3.) +27770.9* (double)( nx0+ny0+nz0 )/3. -144010.) / (maxtime*1000000.); print_results("LU", &class, &nx0, &ny0, &nz0, &itmax, &nnodes_compiled, &num, &maxtime, &mflops, " floating point", &iverified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6); // FILE *perf_file; // char name[50] = "/shared/DEMOS/RCCE/NPB_LU/perf."; // char postfix[50]; // sprintf(postfix, "%d", nnodes_compiled); // strcat(name, postfix); // perf_file = fopen(name,"w"); // fprintf(perf_file, "%d", (int)mflops); // fclose(perf_file); }