// Collect message sizes and times, separating same-rank, same-node, and internode messages static Hashtable<string,Array<const Vector<float,2>>> message_statistics(const vector<vector<Array<const history_t>>>& event_sorted_history, const int ranks_per_node, const int threads_per_rank, const time_kind_t source_kind, const int steps, RawArray<const double> slice_compression_ratio) { GEODE_ASSERT(ranks_per_node>=1); GEODE_ASSERT(threads_per_rank>1); GEODE_ASSERT(vec(request_send_kind,response_send_kind,output_send_kind).contains(source_kind)); const int ranks = CHECK_CAST_INT(event_sorted_history.size())/threads_per_rank; GEODE_ASSERT((int)event_sorted_history.size()==ranks*threads_per_rank); GEODE_ASSERT(slice_compression_ratio.size()==37); GEODE_ASSERT(steps==1 || steps==2); // Separate same-rank, same-node, and internode Vector<Array<Vector<float,2>>,3> data; // Traverse each message and place it in the appropriate bin for (const int source_rank : range(ranks)) { const int source_thread = source_rank*threads_per_rank; for (const history_t& source : event_sorted_history[source_thread][source_kind]) { auto deps = event_dependencies(event_sorted_history,1,source_thread,source_kind,source); GEODE_ASSERT(deps.size()==1); if (steps==2) { GEODE_ASSERT(source_kind == request_send_kind); deps = event_dependencies(event_sorted_history,1,deps[0].x,deps[0].y,deps[0].z); GEODE_ASSERT(deps.size()==1); } const int target_thread = deps[0].x; const int target_rank = target_thread/threads_per_rank; GEODE_ASSERT(target_thread==target_rank*threads_per_rank); const history_t& target = deps[0].z; // Clamp message time to be nonnegative const double time = max(0,target.start.seconds()-source.start.seconds()); // Estimate message size double size; if (source_kind == request_send_kind) size = 8; else { const section_t section = parse_section(source.event); const Vector<uint8_t,4> block = parse_block(source.event); double compression_ratio = 1; if (source_kind == response_send_kind) { 
compression_ratio = slice_compression_ratio[section.sum()]; GEODE_ASSERT(0<compression_ratio && compression_ratio<1); } size = sizeof(Vector<super_t,2>)*block_shape(section.shape(),block).product()*compression_ratio; } // Add entry const int type = source_rank==target_rank ? 0 : source_rank/ranks_per_node==target_rank/ranks_per_node ? 1 : 2; data[type].append(Vector<float,2>(size,time)); } } // Make a nice hashtable for Python Hashtable<string,Array<const Vector<float,2>>> table; table["same-rank"] = data[0]; table["same-node"] = data[1]; table["different"] = data[2]; return table; }
// Compute rank-to-rank bandwidth estimates localized in time (dimensions: epoch,src,dst) static Array<double,3> estimate_bandwidth(const vector<vector<Array<const history_t>>>& event_sorted_history, const int threads, const double dt_seconds) { Log::Scope scope("estimate bandwidth"); GEODE_ASSERT(threads>1); const int ranks = CHECK_CAST_INT(event_sorted_history.size())/threads; GEODE_ASSERT((int)event_sorted_history.size()==ranks*threads); const double dt = 1e6*dt_seconds; // Count how many epochs we need int64_t elapsed = 0; for (auto& thread : event_sorted_history) for (auto& events : thread) if (events.size()) elapsed = max(elapsed,events.back().end.us); const int epochs = int(ceil(elapsed/dt)); // Last epoch is incomplete // Statics: responses, outputs, total Vector<uint64_t,3> messages; Vector<double,3> total_data, total_time, max_time; int64_t max_time_travel = 0; const double compression_ratio = .35; // Traverse each large message, accumulating total data sent Array<double,3> bandwidths(epochs,ranks,ranks); for (const int target_rank : range(ranks)) for (const int kind : vec(response_recv_kind,output_recv_kind)) for (const history_t& target : event_sorted_history[threads*target_rank][kind]) { const auto deps = event_dependencies(event_sorted_history,-1,threads*target_rank,kind,target); GEODE_ASSERT(deps.size()==1); const int source_thread = deps[0].x; const int source_rank = source_thread/threads; GEODE_ASSERT(source_thread==source_rank*threads); const history_t& source = deps[0].z; const bool which = kind==output_recv_kind; messages[which]++; // Estimate message size const section_t section = parse_section(source.event); const Vector<uint8_t,4> block = parse_block(source.event); const double data_size = sizeof(Vector<super_t,2>)*block_shape(section.shape(),block).product()*(kind==response_recv_kind?compression_ratio:1); total_data[which] += data_size; // Distribute data amongst all overlapped epochs const int64_t time_travel = source.start.us - 
target.end.us; max_time_travel = max(max_time_travel,time_travel); Box<double> box(source.start.us/dt,target.end.us/dt); if (box.size()<=1e-7) box = Box<double>(box.center()).thickened(.5e-7); total_time[which] += box.size(); max_time[which] = max(max_time[which],box.size()); const double rate = data_size/box.size(); for (const int epoch : range(max(0,int(box.min)),min(epochs,int(box.max)+1))) bandwidths(epoch,source_rank,target_rank) += rate*Box<double>::intersect(box,Box<double>(epoch,epoch+1)).size(); } // Rescale bandwidths /= dt_seconds; // Print statistics cout << "dt = "<<dt_seconds<<" s"<<endl; cout << "elapsed = "<<1e-6*elapsed<<" s"<<endl; cout << "ranks = "<<ranks<<endl; messages[2] = messages.sum(); total_data[2] = total_data.sum(); total_time[2] = total_time.sum(); max_time[2] = max_time.max(); for (int i=0;i<3;i++) { cout << (i==0?"responses:":i==1?"outputs:":"total:") << endl; cout << " messages = "<<messages[i]<<endl; cout << " total data = "<<total_data[i]<<endl; cout << " total time = "<<dt_seconds*total_time[i]<<endl; cout << " average time = "<<dt_seconds*total_time[i]/messages[i]<<endl; cout << " max time = "<<dt_seconds*max_time[i]<<endl; cout << " average bandwidth = "<<total_data[i]/(1e-6*elapsed)<<endl; cout << " average bandwidth / ranks = "<<total_data[i]/(1e-6*elapsed*ranks)<<endl; } cout << "max time travel = "<<1e-6*max_time_travel<<endl; cout << "bandwidth array stats:"<<endl; const double sum = bandwidths.sum(); cout << " sum = "<<sum<<endl; cout << " average rank bandwidth = "<<sum/epochs/ranks<<endl; cout << " average rank-to-rank bandwidth = "<<sum/epochs/sqr(ranks)<<endl; // All done return bandwidths; }
// Format a 128-bit unsigned integer in decimal.
// Only values that fit in 64 bits are supported; anything larger trips the assertion.
string GEODE_UNUSED str(uint128_t n) {
  uint64_t lo(n);
  GEODE_ASSERT(lo==n);
  // lo is unsigned, so use an unsigned conversion: "%lld" would render values >= 2^63 as negative.
  // The explicit cast makes the argument match "%llu" even where uint64_t is unsigned long.
  return format("%llu",static_cast<unsigned long long>(lo));
}
// List the (kind,event) pairs adjacent to the given (kind,event) in the dependency graph.
//   direction - must be +1 or -1 (asserted); the switch below is keyed on direction*kind, and
//               negative direction selects the predecessor-style branches
//   kind      - kind of the given event
//   event     - packed event; its section, block, dimensions and event-kind bits are decoded here
// Unrecognized direction*kind combinations produce an empty list.
static Array<Tuple<time_kind_t,event_t>> dependencies(const int direction, const time_kind_t kind, const event_t event) {
  GEODE_ASSERT(abs(direction)==1);
  static_assert(compress_kind==0,"Verify that -kind != kind for kinds we care about");
  const bool backward = direction<0;

  // Decode the event
  const section_t section = parse_section(event);
  const auto block = parse_block(event);
  const uint8_t packed_dimensions = parse_dimensions(event);
  const uint8_t parent_to_child_symmetry = packed_dimensions>>2;
  const uint8_t dimension = packed_dimensions&3;
  const auto ekind = event&ekind_mask;

  // See mpi/graph for summarized explanation
  Array<Tuple<time_kind_t,event_t>> found;
  const auto emit = [&found](const time_kind_t next_kind, const event_t next_event) {
    found.append(tuple(next_kind,next_event));
  };

  switch (direction*kind) {
    case -allocate_line_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      break;
    }
    case response_recv_kind:
    case -request_send_kind: {
      GEODE_ASSERT(ekind==block_lines_ekind);
      // Map the block-lines event back to the line event of the parent section
      const auto parent_section = section.parent(dimension).transform(symmetry_t::invert_global(parent_to_child_symmetry));
      const auto permutation = section_t::quadrant_permutation(parent_to_child_symmetry);
      const uint8_t parent_dimension = permutation.find(dimension);
      const auto block_base = Vector<uint8_t,4>(block.subset(permutation)).remove_index(parent_dimension);
      emit(kind==response_recv_kind ? schedule_kind : allocate_line_kind,
           line_event(parent_section,parent_dimension,block_base));
      break;
    }
    case request_send_kind: {
      GEODE_ASSERT(ekind==block_lines_ekind);
      emit(response_send_kind,event);
      break;
    }
    case -response_send_kind:
    case response_send_kind: {
      GEODE_ASSERT(ekind==block_lines_ekind);
      emit(backward ? request_send_kind : response_recv_kind,event);
      break;
    }
    case -response_recv_kind: {
      GEODE_ASSERT(ekind==block_lines_ekind);
      emit(response_send_kind,event);
      break;
    }
    case allocate_line_kind:
    case -schedule_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      // Sections with sum 35 contribute no block-lines events here
      if (section.sum()==35)
        break;
      const auto other_kind = kind==allocate_line_kind ? request_send_kind : response_recv_kind;
      const auto child_section = section.child(dimension).standardize<8>();
      const auto permutation = section_t::quadrant_permutation(symmetry_t::invert_global(child_section.y));
      const uint8_t child_dimension = permutation.find(dimension);
      const dimensions_t child_dimensions(child_section.y,child_dimension);
      auto child_block = Vector<uint8_t,4>(block.slice<0,3>().insert(0,dimension).subset(permutation));
      // One block-lines event per block along the child dimension
      for (const uint8_t b : range(section_blocks(child_section.x)[child_dimension])) {
        child_block[child_dimension] = b;
        emit(other_kind,block_lines_event(child_section.x,child_dimensions,child_block));
      }
      break;
    }
    case schedule_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      emit(compute_kind,event); // Corresponds to many different microline compute events
      break;
    }
    case -compute_kind: // Note: all microline compute events have the same line event
    case compute_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      emit(backward ? schedule_kind : wakeup_kind,event);
      break;
    }
    case -wakeup_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      emit(compute_kind,event); // Corresponds to many different microline compute events
      break;
    }
    case wakeup_kind: {
      GEODE_ASSERT(ekind==line_ekind);
      // One output send per block along the line
      const auto block_base = block.slice<0,3>();
      for (const uint8_t b : range(section_blocks(section)[dimension]))
        emit(output_send_kind,block_line_event(section,dimension,block_base.insert(b,dimension)));
      break;
    }
    case -output_send_kind:
    case output_send_kind: {
      GEODE_ASSERT(ekind==block_line_ekind);
      if (backward)
        emit(wakeup_kind,line_event(section,dimension,block.remove_index(dimension)));
      else
        emit(output_recv_kind,event);
      break;
    }
    case -output_recv_kind:
    case output_recv_kind: {
      GEODE_ASSERT(ekind==block_line_ekind);
      emit(backward ? output_send_kind : snappy_kind,event);
      break;
    }
    case -snappy_kind:
    case snappy_kind: {
      GEODE_ASSERT(ekind==block_line_ekind);
      // Nothing is emitted for snappy in the forward direction
      if (backward)
        emit(output_recv_kind,event);
      break;
    }
    default:
      break;
  }
  return found;
}
// Python type object lookup for arrays of Rotation<TV>.
template<class TV> struct NumpyArrayType<Rotation<TV>> {
  // Type object pointer; must be non-null before type() is called.
  static PyTypeObject* t;

  // Return t after incrementing its reference count.  Asserts that t has been set.
  static PyTypeObject* type() {
    GEODE_ASSERT(t);
    PyTypeObject* const registered = t;
    Py_INCREF(registered);
    return registered;
  }
};
// Return the slice of all_data belonging to the given local block.
// Asserts that the block's info record has no missing dimensions.
RawArray<const Vector<super_t,2>> readable_block_store_t::get_raw_flat(local_id_t local_id) const {
  const auto& info = block_info.get(local_id);
  GEODE_ASSERT(!info.missing_dimensions);
  const auto& nodes = info.nodes;
  return all_data.slice(nodes);
}
// Decode a compact_blob_t from a raw byte blob.
// The blob must be exactly sizeof(compact_blob_t) bytes; otherwise the assertion fails with a
// message listing the expected size, the actual size, and the blob contents.
compact_blob_t supertensor_index_t::block_location(RawArray<const uint8_t> blob) {
  compact_blob_t b;
  // sizeof yields size_t, which does not match "%d"; cast both sizes to int so the arguments
  // agree with the conversion specifiers.
  GEODE_ASSERT(blob.size()==sizeof(b),format("expected size %d, got size %d, data %s",int(sizeof(b)),int(blob.size()),str(blob)));
  // Byte-wise copy into the struct
  memcpy(&b,blob.data(),sizeof(b));
  return b;
}
// Convenience overload: check that x has shape xshape, then forward to the overload that takes
// the flat data viewed as an (n+3) x d array.
T operator()(NdArray<const T> x) const {
  GEODE_ASSERT(x.shape==xshape);
  const auto flat2d = x.flat.reshape(n+3,d);
  return (*this)(flat2d);
}
// Damping gradient is not implemented for SurfacePins.
// The matrix size is checked against mass.size() first, then GEODE_NOT_IMPLEMENTED() is invoked.
void SurfacePins::add_damping_gradient(SolidMatrix<TV>& matrix) const {
  GEODE_ASSERT(matrix.size()==mass.size());
  GEODE_NOT_IMPLEMENTED();
}
// Build an index over the given sections: keep a reference to them and precompute section_offset
// via make_offsets.
supertensor_index_t::supertensor_index_t(const sections_t& sections)
  : sections(ref(sections))
  , section_offset(make_offsets(sections))
{
  // The given sections must match the descendent sections computed for sections.slice,
  // i.e. we need a complete slice
  GEODE_ASSERT(descendent_sections(section_t(),sections.slice).at(sections.slice)->sections==sections.sections);
}
// Safely expose snap_divs to python for testing purposes static Array<Quantized> snap_divs_test(RawArray<mp_limb_t,2> values, const bool take_sqrt) { GEODE_ASSERT(values.m && !values.back().contains_only(0)); Array<Quantized> result(values.m-1); snap_divs(result,values,take_sqrt); return result; }
template<class PerturbedT> bool perturbed_ratio(RawArray<Quantized> result, void(*const ratio)(RawArray<mp_limb_t,2>,RawArray<const Vector<Exact<1>,PerturbedT::m>>), const int degree, RawArray<const PerturbedT> X, const bool take_sqrt) { const int m = PerturbedT::m; typedef Vector<Exact<1>,m> EV; const int n = X.size(); const int r = result.size(); if (verbose) cout << "perturbed_ratio:\n degree = "<<degree<<"\n X = "<<X<<endl; // Check if the ratio is nonsingular before perturbation const auto Z = GEODE_RAW_ALLOCA(n,EV); const int precision = degree*Exact<1>::ratio; { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())); const auto R = GEODE_RAW_ALLOCA((r+1)*precision,mp_limb_t).reshape(r+1,precision); ratio(R,Z); if (const int sign = mpz_sign(R[r])) { snap_divs(result,R,take_sqrt); return sign>0; } } // Check the first perturbation level with specialized code vector<Vector<ExactInt,m>> Y(n); // perturbations { // Compute the first level of perturbations for (int i=0;i<n;i++) Y[i] = perturbation<m>(1,X[i].seed()); if (verbose) cout << " Y = "<<Y<<endl; // Evaluate polynomial at epsilon = 1, ..., degree const int scaled_precision = precision+factorial_limbs(degree); const auto values = GEODE_RAW_ALLOCA(degree*(r+1)*scaled_precision,mp_limb_t).reshape(degree,r+1,scaled_precision); for (int j=0;j<degree;j++) { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())+(j+1)*Y[i]); ratio(values[j],Z); if (verbose) cout << " ratio("<<Z<<") = "<<mpz_str(values[j])<<endl; } // Find interpolating polynomials, overriding the input with the result. for (int k=0;k<=r;k++) { scaled_univariate_in_place_interpolating_polynomial(values.sub<1>(k)); if (verbose) cout << " coefs "<<k<<" = "<<mpz_str(values.sub<1>(k))<<endl; } // Find the largest (lowest degree) nonzero denominator coefficient. If we detect an infinity during this process, explode. 
for (int j=0;j<degree;j++) { if (const int sign = mpz_sign(values(j,r))) { // We found a nonzero, now compute the rounded ratio snap_divs(result,values[j],take_sqrt); return sign>0; } else for (int k=0;k<r;k++) if (mpz_nonzero(values(j,k))) throw OverflowError(format("perturbed_ratio: infinite result in l'Hopital expansion: %s/0",mpz_str(values(j,k)))); } } { // Add one perturbation level after another until we hit a nonzero denominator. Our current implementation duplicates // work from one iteration to the next for simplicity, which is fine since the first interation suffices almost always. for (int d=2;;d++) { // Compute the next level of perturbations Y.resize(d*n); for (int i=0;i<n;i++) Y[(d-1)*n+i] = perturbation<m>(d,X[i].seed()); // Evaluate polynomial at every point in an "easy corner" const auto lambda = monomials(degree,d); const Array<mp_limb_t,3> values(lambda.m,r+1,precision,uninit); for (int j=0;j<lambda.m;j++) { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())+lambda(j,0)*Y[i]); for (int v=1;v<d;v++) for (int i=0;i<n;i++) Z[i] += EV(lambda(j,v)*Y[v*n+i]); ratio(values[j],Z); } // Find interpolating polynomials, overriding the input with the result. 
for (int k=0;k<=r;k++) in_place_interpolating_polynomial(degree,lambda,values.sub<1>(k)); // Find the largest nonzero denominator coefficient int sign = 0; int nonzero = -1; for (int j=0;j<lambda.m;j++) if (const int s = mpz_sign(values(j,r))) { if (check) // Verify that a term which used to be zero doesn't become nonzero GEODE_ASSERT(lambda(j,d-1)); if (nonzero<0 || monomial_less(lambda[nonzero],lambda[j])) { sign = s; nonzero = j; } } // Verify that numerator coefficients are zero for all large monomials for (int j=0;j<lambda.m;j++) if (nonzero<0 || monomial_less(lambda[nonzero],lambda[j])) for (int k=0;k<r;k++) if (mpz_nonzero(values(j,k))) throw OverflowError(format("perturbed_ratio: infinite result in l'Hopital expansion: %s/0",str(values(j,k)))); // If we found a nonzero, compute the result if (nonzero >= 0) { snap_divs(result,values[nonzero],take_sqrt); return sign>0; } // If we get through two levels without fixing the degeneracy, run a fast, strict identity test to make sure we weren't handed an impossible problem. if (d==2) assert_last_nonzero(ratio,values[0],X,"perturbed_ratio: identically zero denominator"); } } }
template<class PerturbedT> bool perturbed_sign(void(*const predicate)(RawArray<mp_limb_t>,RawArray<const Vector<Exact<1>,PerturbedT::m>>), const int degree, RawArray<const PerturbedT> X) { const int m = PerturbedT::m; typedef Vector<Exact<1>,m> EV; if (check) GEODE_WARNING("Expensive consistency checking enabled"); const int n = X.size(); if (verbose) cout << "perturbed_sign:\n degree = "<<degree<<"\n X = "<<X<<endl; // Check if the predicate is nonsingular without perturbation const auto Z = GEODE_RAW_ALLOCA(n,EV); const int precision = degree*Exact<1>::ratio; { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())); const auto R = GEODE_RAW_ALLOCA(precision,mp_limb_t); predicate(R,Z); if (const int sign = mpz_sign(R)) return sign>0; } // Check the first perturbation level with specialized code vector<Vector<ExactInt,m>> Y(n); // perturbations { // Compute the first level of perturbations for (int i=0;i<n;i++) Y[i] = perturbation<m>(1,X[i].seed()); if (verbose) cout << " Y = "<<Y<<endl; // Evaluate polynomial at epsilon = 1, ..., degree const int scaled_precision = precision+factorial_limbs(degree); const auto values = GEODE_RAW_ALLOCA(degree*scaled_precision,mp_limb_t).reshape(degree,scaled_precision); memset(values.data(),0,sizeof(mp_limb_t)*values.flat.size()); for (int j=0;j<degree;j++) { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())+(j+1)*Y[i]); predicate(values[j],Z); if (verbose) cout << " predicate("<<Z<<") = "<<mpz_str(values[j])<<endl; } // Find an interpolating polynomial, overriding the input with the result. scaled_univariate_in_place_interpolating_polynomial(values); if (verbose) cout << " coefs = "<<mpz_str(values)<<endl; // Compute sign for (int j=0;j<degree;j++) if (const int sign = mpz_sign(values[j])) return sign>0; } { // Add one perturbation level after another until we hit a nonzero polynomial. 
Our current implementation duplicates // work from one iteration to the next for simplicity, which is fine since the first interation suffices almost always. for (int d=2;;d++) { if (verbose) cout << " level "<<d<<endl; // Compute the next level of perturbations Y.resize(d*n); for (int i=0;i<n;i++) Y[(d-1)*n+i] = perturbation<m>(d,X[i].seed()); // Evaluate polynomial at every point in an "easy corner" const auto lambda = monomials(degree,d); const Array<mp_limb_t,2> values(lambda.m,precision,uninit); for (int j=0;j<lambda.m;j++) { for (int i=0;i<n;i++) Z[i] = EV(to_exact(X[i].value())+lambda(j,0)*Y[i]); for (int v=1;v<d;v++) for (int i=0;i<n;i++) Z[i] += EV(lambda(j,v)*Y[v*n+i]); predicate(values[j],Z); } // Find an interpolating polynomial, overriding the input with the result. in_place_interpolating_polynomial(degree,lambda,values); // Compute sign int sign = 0; int sign_j = -1; for (int j=0;j<lambda.m;j++) if (const int s = mpz_sign(values[j])) { if (check) // Verify that a term which used to be zero doesn't become nonzero GEODE_ASSERT(lambda(j,d-1)); if (!sign || monomial_less(lambda[sign_j],lambda[j])) { sign = s; sign_j = j; } } // If we find a nonzero sign, we're done! if (sign) return sign>0; // If we get through two levels without fixing the degeneracy, run a fast, strict identity test to make sure we weren't handed an impossible problem. if (d==2) assert_last_nonzero(predicate,values[0],X,"perturbed_sign: identically zero predicate"); } } }
void gradient(RawArray<const T,2> x, RawArray<T,2> grad) const { // Temporary arrays and views GEODE_ASSERT(x.sizes()==vec(n+3,d) && grad.sizes()==x.sizes()); const auto sx = smallx.flat.raw(), sv = smallv.flat.raw(); // Collect quadrature points const int e = 4*d; Array<T,3> tq( n,quads,e,uninit); Array<T,4> xq(vec(n,quads,e,d),uninit); Array<T,4> vq(vec(n,quads,e,d),uninit); for (int i=0;i<n;i++) { T_INFO(i) for (int q=0;q<quads;q++) { const T s = samples[q], t = t1+dt*s; for (int j=0;j<e;j++) tq(i,q,j) = t; SPLINE_INFO(s) for (int a=0;a<d;a++) { X_INFO(i,a) const T x = a0*x0+a1*x1+a2*x2+a3*x3, v = b0*x0+b1*x1+b2*x2+b3*x3; for (int j=0;j<e;j++) { xq(i,q,j,a) = x; vq(i,q,j,a) = v; } } for (int a=0;a<d;a++) { xq(i,q,4*a ,a) -= sx[a]; xq(i,q,4*a+1,a) += sx[a]; vq(i,q,4*a+2,a) -= sv[a]; vq(i,q,4*a+3,a) += sv[a]; } } } // Compute energies const auto Uq_ = U(tq.reshape_own(n*quads*e),NdArray<const T>(q2shape,xq.flat),NdArray<const T>(q2shape,vq.flat)); GEODE_ASSERT(Uq_.size()==n*quads*e); const auto Uq = Uq_.reshape(n,quads,e); // Accumulate grad.fill(0); const auto inv_2s = GEODE_RAW_ALLOCA(d,Vector<T,2>); for (int a=0;a<d;a++) inv_2s[a] = vec(.5/sx[a],.5/sv[a]); for (int i=0;i<n;i++) { T_INFO(i) for (int q=0;q<quads;q++) { const T s = samples[q], w = dt*weights[q]; SPLINE_INFO(s) for (int a=0;a<d;a++) { const T wx = w*inv_2s[a].x*(Uq(i,q,4*a+1)-Uq(i,q,4*a )), wv = w*inv_2s[a].y*(Uq(i,q,4*a+3)-Uq(i,q,4*a+2)); grad(i ,a) += a0*wx+b0*wv; grad(i+1,a) += a1*wx+b1*wv; grad(i+2,a) += a2*wx+b2*wv; grad(i+3,a) += a3*wx+b3*wv; } } } }
// Map per-node values X on the coarse mesh to per-node values on the fine mesh by multiplying
// with loop_matrix().  X must have one entry per coarse mesh node (asserted).
template<class TV> Array<TV> TriangleSubdivision::loop_subdivide(RawArray<const TV> X) const {
  GEODE_ASSERT(X.size()==coarse_mesh->nodes());
  const auto fine_count = fine_mesh->nodes();
  Array<TV> fine_X(fine_count,uninit);
  const auto matrix = loop_matrix();
  matrix->multiply(X,fine_X);
  return fine_X;
}
// Intended to fill hess (shape (n+3) x 4 x d x d, per the assertion) from finite differences of U
// at the (n+3) x d array x.
//
// NOTE(review): this block looks unfinished - confirm before relying on it:
//   * `d4` and `grad` are used below but are not declared anywhere in this function; unless they
//     are class members or macro-provided names, this will not compile.
//   * `hess` is only read in the size assertion; nothing visible here writes to it.
//   * The accumulation loop at the bottom mirrors a first-derivative (gradient) accumulation
//     using slots 4*b..4*b+3, which does not obviously match the stencil layout built above
//     (slots start at j = 1).
//   * By my count the stencil loop consumes 1+8*d+6*d*(d-1) slots per component, while
//     e = 1+8*d+8*d*(d-1); the two agree only for d == 1 - TODO confirm the intended layout.
//   * Every step below is applied to component `a` of the slot, even when the step size comes
//     from index b or c - verify that this is intended.
void hessian(RawArray<const T,2> x, RawArray<T,4> hess) const {
  // Temporary arrays and views
  GEODE_ASSERT(x.sizes()==vec(n+3,d) && hess.sizes()==vec(n+3,4,d,d));
  const auto sx = smallx.flat.raw(), sv = smallv.flat.raw();

  // Collect quadrature points
  const int e = 1+8*d+8*d*(d-1);
  Array<T,3> tq( n,quads,e,uninit);
  Array<T,4> xq(vec(n,quads,e,d),uninit);
  Array<T,4> vq(vec(n,quads,e,d),uninit);
  for (int i=0;i<n;i++) {
    T_INFO(i)
    for (int q=0;q<quads;q++) {
      const T s = samples[q], t = t1+dt*s;
      for (int j=0;j<e;j++)
        tq(i,q,j) = t;
      SPLINE_INFO(s)
      for (int a=0;a<d;a++) {
        X_INFO(i,a)
        const T x = a0*x0+a1*x1+a2*x2+a3*x3, v = b0*x0+b1*x1+b2*x2+b3*x3;
        // Every slot starts at the unperturbed interpolated position and velocity
        for (int j=0;j<e;j++) {
          xq(i,q,j,a) = x;
          vq(i,q,j,a) = v;
        }
        // Slot 0 stays unperturbed; j walks through the remaining slots in order
        int j = 1;
        for (int b=0;b<d;b++) {
          const T xb = sx[b], vb = sv[b];
          // Single steps: position -/+, then velocity -/+
          xq(i,q,j++,a) -= xb;
          xq(i,q,j++,a) += xb;
          vq(i,q,j++,a) -= vb;
          vq(i,q,j++,a) += vb;
          // Combined position/velocity steps: (-,-), (-,+), (+,-), (+,+)
          xq(i,q,j ,a) -= xb; vq(i,q,j++,a) -= vb;
          xq(i,q,j ,a) -= xb; vq(i,q,j++,a) += vb;
          xq(i,q,j ,a) += xb; vq(i,q,j++,a) -= vb;
          xq(i,q,j ,a) += xb; vq(i,q,j++,a) += vb;
          for (int c=b+1;c<d;c++) {
            const T xc = sx[c], vc = sv[c];
            // Position steps built from the b and c step sizes
            xq(i,q,j++,a) -= xb+xc;
            xq(i,q,j++,a) -= xb-xc;
            xq(i,q,j++,a) += xb-xc;
            xq(i,q,j++,a) += xb+xc;
            // Velocity steps built from the b and c step sizes
            vq(i,q,j++,a) -= vb+vc;
            vq(i,q,j++,a) -= vb-vc;
            vq(i,q,j++,a) += vb-vc;
            vq(i,q,j++,a) += vb+vc;
            // NOTE(review): the first of these four slots changes only vq, unlike the three
            // that follow, and none of them use xc/vc - possibly an incomplete edit; confirm.
            vq(i,q,j++,a) -= sv[b];
            xq(i,q,j ,a) -= sx[b]; vq(i,q,j++,a) += sv[b];
            xq(i,q,j ,a) += sx[b]; vq(i,q,j++,a) -= sv[b];
            xq(i,q,j ,a) += sx[b]; vq(i,q,j++,a) += sv[b];
          }
        }
      }
    }
  }

  // Compute energies
  // NOTE(review): the arrays above were sized with e, but d4 is used from here on.
  const auto Uq_ = U(tq.reshape_own(n*quads*d4),NdArray<const T>(q2shape,xq.flat),NdArray<const T>(q2shape,vq.flat));
  GEODE_ASSERT(Uq_.size()==n*quads*d4);
  const auto Uq = Uq_.reshape(n,quads,d4);

  // Accumulate
  // NOTE(review): this writes into grad, not hess.
  grad.fill(0);
  const auto inv_2s = GEODE_RAW_ALLOCA(d,Vector<T,2>);
  for (int a=0;a<d;a++)
    inv_2s[a] = vec(.5/sx[a],.5/sv[a]);
  for (int i=0;i<n;i++) {
    T_INFO(i)
    for (int q=0;q<quads;q++) {
      const T s = samples[q], w = dt*weights[q];
      SPLINE_INFO(s)
      for (int b=0;b<d;b++) {
        const T wx = w*inv_2s[b].x*(Uq(i,q,4*b+1)-Uq(i,q,4*b )), wv = w*inv_2s[b].y*(Uq(i,q,4*b+3)-Uq(i,q,4*b+2));
        grad(i ,b) += a0*wx+b0*wv;
        grad(i+1,b) += a1*wx+b1*wv;
        grad(i+2,b) += a2*wx+b2*wv;
        grad(i+3,b) += a3*wx+b3*wv;
      }
    }
  }
}
// Return the frozen bytes that store holds for the given local block.
// Asserts that the block's info record has no missing dimensions.
RawArray<const uint8_t> readable_block_store_t::get_compressed(local_id_t local_id) const {
  const auto& info = block_info(local_id);
  GEODE_ASSERT(!info.missing_dimensions);
  const auto& flat_id = info.flat_id;
  return store.get_frozen(flat_id);
}
// Convenience overload: check that x has shape xshape, allocate a result of the same shape, and
// let the two-argument gradient overload fill it through (n+3) x d views of the flat data.
NdArray<T> gradient(NdArray<const T> x) const {
  GEODE_ASSERT(x.shape==xshape);
  NdArray<T> grad(xshape,uninit);
  const auto x_view = x.flat.reshape(n+3,d);
  const auto grad_view = grad.flat.reshape(n+3,d);
  gradient(x_view,grad_view);
  return grad;
}
// NumPy descriptor lookup for Rotation<TV>.
template<class TV> struct NumpyDescr<Rotation<TV>> {
  // Descriptor pointer; must be non-null before descr() is called.
  static PyArray_Descr* d;

  // Return d after incrementing its reference count.  Asserts that d has been set.
  static PyArray_Descr* descr() {
    GEODE_ASSERT(d);
    PyArray_Descr* const registered = d;
    Py_INCREF(registered);
    return registered;
  }
};