// Grammar rule:  T -> Ta ( ',' Ta )+  =>  'tau'
//                  -> Ta
// Parses one or more Ta separated by commas; a "tau" tuple node is
// built only when at least one comma was consumed.
void RpalParser::T()
{
  pushProc("T()");

  Ta();

  // Count every Ta parsed; arity starts at 1 for the mandatory first one.
  int arity = 1;
  for (; _nt == ","; ++arity) {
    read_token(",");
    Ta();
  }

  if (arity > 1)
    build("tau", arity);

  popProc("T()");
}
// One relaxation pass (pthread worker): refine the 8-parameter homography
// of every region in this thread's slice [me.r0, me.rlim). For each mapped
// region with enough correspondences, an 8x8 normal system is accumulated
// from its point-pair constraints (neighbor transforms read from Xin) and
// solved in place into Xout.
//
// ithr is the worker index passed through the pthread void* argument.
static void* _OnePass_HFromH( void* ithr )
{
    CThrdat &me = vthr[(int)(long)ithr];

    // Parameter-column index sets for the two scalar equations each point
    // pair contributes: i1 feeds the B.x equation (params 0,1,2,6,7),
    // i2 feeds the B.y equation (params 3,4,5,6,7).
    int i1[5] = { 0, 1, 2, 6, 7 },
        i2[5] = { 3, 4, 5, 6, 7 };

    for( int i = me.r0; i < me.rlim; ++i ) {

        const RGN& R = vRgn[i];

        if( R.itr < 0 )      // region has no transform slot assigned
            continue;

        int nc = myc[i].size();

        if( nc < 4 )         // too few correspondences to constrain 8 params
            continue;

        double *RHS = &(*Xout)[R.itr * 8];   // solved in place into output
        double LHS[8*8];
        THmgphy Ta( &(*Xin)[R.itr * 8] );    // this region's current transform
        THmgphy Tb;                          // partner region's transform
        int lastb = -1; // cache Tb

        memset( RHS, 0, 8 * sizeof(double) );
        memset( LHS, 0, 8*8 * sizeof(double) );

        for( int j = 0; j < nc; ++j ) {

            const Constraint& C = vAllC[myc[i][j]];
            Point A, B;

            // Mixing old and new solutions is related to
            // "successive over relaxation" methods in other
            // iterative solution schemes. Experimentally,
            // I like w = 0.9 (same layer), 0.9 (down).

            if( C.r1 == i ) {

                // This region is the constraint's first member;
                // the partner is C.r2.
                int bitr = vRgn[C.r2].itr;

                if( bitr < 0 )
                    continue;

                if( C.r2 != lastb ) {
                    // Reload Tb only when the partner changes; cheap cache
                    // that pays off when constraints arrive grouped by
                    // partner region.
                    Tb.CopyIn( &(*Xin)[bitr * 8] );
                    lastb = C.r2;
                }
                Tb.Transform( B = C.p2 );
                Ta.Transform( A = C.p1 );

                // Relax the target point B toward A's current mapping.
                B.x = w * B.x + (1 - w) * A.x;
                B.y = w * B.y + (1 - w) * A.y;

                A = C.p1;    // the equation uses the untransformed A
            }
            else {

                // Mirror case: this region is C.r2; partner is C.r1.
                int bitr = vRgn[C.r1].itr;

                if( bitr < 0 )
                    continue;

                if( C.r1 != lastb ) {
                    Tb.CopyIn( &(*Xin)[bitr * 8] );
                    lastb = C.r1;
                }
                Tb.Transform( B = C.p1 );
                Ta.Transform( A = C.p2 );

                B.x = w * B.x + (1 - w) * A.x;
                B.y = w * B.y + (1 - w) * A.y;

                A = C.p2;
            }

            // Two normal-equation rows per pair: the x-row over columns i1,
            // then v[3],v[4] are rewritten for the y-row over columns i2.
            double v[5] = { A.x, A.y, 1.0, -A.x*B.x, -A.y*B.x };

            AddConstraint_Quick( LHS, RHS, 8, 5, i1, v, B.x );

            v[3] = -A.x*B.y;
            v[4] = -A.y*B.y;

            AddConstraint_Quick( LHS, RHS, 8, 5, i2, v, B.y );
        }

        // Early passes (gpass < EDITDELAY): accept the raw solution without
        // the quality checks below.
        if( gpass < EDITDELAY ) {
            Solve_Quick( LHS, RHS, 8 );
            continue;
        }

        // Later passes: if the solve fails or the result is too distorted
        // (Squareness beyond SQRTOL), retry this region with same-layer-only
        // constraints.
        if( !Solve_Quick( LHS, RHS, 8 )
            || THmgphy( RHS ).Squareness() > SQRTOL ) {

            HFromH_SLOnly( RHS, i, (int)(long)ithr );
        }
    }

    return NULL;
}
/// Build a per-tile norm estimate for a 3- or 4-index integral tensor from
/// the Schwarz screening matrices: each locally owned tile ordinal gets
/// sqrt(Q_bra * Q_ket); any other rank count falls back to the generic
/// Screener estimate. With replicate == true the locally filled entries are
/// all-reduced so every rank ends up with the complete tensor.
TA::Tensor<float> SchwarzScreen::norm_estimate(
    madness::World &world, std::vector<gaussian::Basis> const &bs_array,
    TA::Pmap const &pmap, bool replicate) const {
  const auto ndims = bs_array.size();
  auto trange = gaussian::detail::create_trange(bs_array);
  auto norms = TA::Tensor<float>(trange.tiles_range(), 0.0);

  if (ndims == 3) {
    // Three-index case: norm(a,b,c) = sqrt(Qbra(a) * Qket(b,c)),
    // walking tile ordinals in row-major (a,b,c) order.
    auto const &qa = Qbra_->Qtile();
    auto const &qbc = Qket_->Qtile();

    auto tile = 0ul;
    for (auto a = 0l; a < qa.size(); ++a) {
      const float qa_val = qa(a);
      for (auto b = 0l; b < qbc.rows(); ++b) {
        for (auto c = 0l; c < qbc.cols(); ++c, ++tile) {
          if (pmap.is_local(tile)) {
            norms[tile] = std::sqrt(qa_val * qbc(b, c));
          }
        }
      }
    }
  } else if (ndims == 4) {
    // Four-index case: norm(a,b,c,d) = sqrt(Qbra(a,b) * Qket(c,d)),
    // row-major (a,b,c,d) ordinal order.
    auto const &qab = Qbra_->Qtile();
    auto const &qcd = Qket_->Qtile();

    auto tile = 0ul;
    for (auto a = 0l; a < qab.rows(); ++a) {
      for (auto b = 0l; b < qab.cols(); ++b) {
        const float qab_val = qab(a, b);
        for (auto c = 0l; c < qcd.rows(); ++c) {
          for (auto d = 0l; d < qcd.cols(); ++d, ++tile) {
            if (pmap.is_local(tile)) {
              norms[tile] = std::sqrt(qab_val * qcd(c, d));
            }
          }
        }
      }
    }
  } else {
    // Unsupported rank: delegate to the base-class estimator.
    norms = Screener::norm_estimate(world, bs_array, pmap);
  }

  world.gop.fence();

  if (replicate) {
    // All-reduce so every rank holds the full tensor. MPI_ISend carries at
    // most an int's worth of elements per message, so an element count that
    // does not fit in an int must be summed in several chunks. Compute the
    // count in 64 bits first; if even that overflowed, the tensor was never
    // going to fit on one node anyway.
    int64_t remaining = norms.size();
    const int64_t int_max = std::numeric_limits<int>::max();

    if (remaining < int_max) {
      // Fits in a single message.
      world.gop.sum(norms.data(), remaining);
    } else {
      // Chunked path. Empirically, chunks near int_max (and int_max/2)
      // failed on NewRiver; int_max/10 is slow but reliable, so stay
      // conservative.
      const int64_t chunk = int_max / 10;
      auto pos = norms.data();
      while (remaining > chunk) {
        world.gop.sum(pos, chunk);
        pos += chunk;
        remaining -= chunk;
      }
      // Reduce whatever is left over.
      world.gop.sum(pos, remaining);
    }
  }
  world.gop.fence();
  return norms;
}
// Fallback solver for region i using Same-Layer-only constraints: same
// normal-system accumulation as the _OnePass_HFromH loop body, but any
// correspondence whose partner region sits on a different z than R is
// skipped. On success the region index is recorded in the thread's Rslo
// list; if the remaining constraints are too few, the solve fails, or the
// result is too distorted, the region goes on the Rkil (kill) list instead.
//
// RHS  - pointer to this region's 8-double output slot (zeroed, then solved
//        in place).
// i    - region index into vRgn/myc.
// ithr - worker thread index (selects the per-thread Rkil/Rslo lists).
static void HFromH_SLOnly( double *RHS, int i, int ithr )
{
    const RGN& R = vRgn[i];

    // Parameter-column index sets: i1 for the B.x equation (params
    // 0,1,2,6,7), i2 for the B.y equation (params 3,4,5,6,7).
    int i1[5] = { 0, 1, 2, 6, 7 },
        i2[5] = { 3, 4, 5, 6, 7 };

    int nc = myc[i].size();
    double LHS[8*8];
    THmgphy Ta( &(*Xin)[R.itr * 8] );   // this region's current transform
    THmgphy Tb;                         // partner region's transform
    int lastb = -1, // cache Tb
        nSLc = 0;   // same-layer constraints actually used

    memset( RHS, 0, 8 * sizeof(double) );
    memset( LHS, 0, 8*8 * sizeof(double) );

    for( int j = 0; j < nc; ++j ) {

        const Constraint& C = vAllC[myc[i][j]];
        Point A, B;

        // Mixing old and new solutions is related to
        // "successive over relaxation" methods in other
        // iterative solution schemes. Experimentally,
        // I like w = 0.9 (same layer), 0.9 (down).

        if( C.r1 == i ) {

            // Partner is C.r2; skip cross-layer pairs.
            if( vRgn[C.r2].z != R.z )
                continue;

            int bitr = vRgn[C.r2].itr;

            if( bitr < 0 )
                continue;

            if( C.r2 != lastb ) {
                // Reload Tb only when the partner changes.
                Tb.CopyIn( &(*Xin)[bitr * 8] );
                lastb = C.r2;
            }
            Tb.Transform( B = C.p2 );
            Ta.Transform( A = C.p1 );

            // Relax the target point B toward A's current mapping.
            B.x = w * B.x + (1 - w) * A.x;
            B.y = w * B.y + (1 - w) * A.y;

            A = C.p1;    // the equation uses the untransformed A
        }
        else {

            // Mirror case: partner is C.r1; skip cross-layer pairs.
            if( vRgn[C.r1].z != R.z )
                continue;

            int bitr = vRgn[C.r1].itr;

            if( bitr < 0 )
                continue;

            if( C.r1 != lastb ) {
                Tb.CopyIn( &(*Xin)[bitr * 8] );
                lastb = C.r1;
            }
            Tb.Transform( B = C.p1 );
            Ta.Transform( A = C.p2 );

            B.x = w * B.x + (1 - w) * A.x;
            B.y = w * B.y + (1 - w) * A.y;

            A = C.p2;
        }

        ++nSLc;

        // Two normal-equation rows per pair: x-row over columns i1, then
        // v[3],v[4] rewritten for the y-row over columns i2.
        double v[5] = { A.x, A.y, 1.0, -A.x*B.x, -A.y*B.x };

        AddConstraint_Quick( LHS, RHS, 8, 5, i1, v, B.x );

        v[3] = -A.x*B.y;
        v[4] = -A.y*B.y;

        AddConstraint_Quick( LHS, RHS, 8, 5, i2, v, B.y );
    }

    // Require at least 4 usable same-layer pairs, a successful solve, and
    // acceptable distortion (Squareness within SQRTOL); otherwise mark the
    // region for removal.
    if( nSLc < 4
        || !Solve_Quick( LHS, RHS, 8 )
        || THmgphy( RHS ).Squareness() > SQRTOL ) {

        vthr[ithr].Rkil.push_back( i );
    }
    else
        vthr[ithr].Rslo.push_back( i );
}