/// Shift the HIP_RY and KNEE_RY angles of @p q (elements 1 and 2) by one full
/// turn whenever that reduces the magnitude of their joint-limit badness.
///
/// @param kc   kinematic constants the joint limits are read from via jl()
/// @param q    joint angle triple; elements 1 and 2 may be modified in place
/// @param leg  leg whose limits apply
static inline void check_wrap(const cmuk::KConstants& kc, vec3f& q, cmuk::LegIndex leg) {

  // Slot 0 holds the HIP_RY limits, slot 1 the KNEE_RY limits. The last jl()
  // argument selects which bound is fetched (0 -> lower, 1 -> upper).
  float lower[2];
  float upper[2];

  lower[0] = jl(kc, leg, cmuk::HIP_RY, 0);
  lower[1] = jl(kc, leg, cmuk::KNEE_RY, 0);
  upper[0] = jl(kc, leg, cmuk::HIP_RY, 1);
  upper[1] = jl(kc, leg, cmuk::KNEE_RY, 1);

  for (int slot = 0; slot < 2; ++slot) {

    float& current = q[slot + 1];

    const float badness_now = compute_badness(current, lower[slot], upper[slot]);

    if (badness_now != 0) {

      // Positive badness -> try one turn down, otherwise one turn up.
      // NOTE(review): presumably the sign tells which limit is exceeded;
      // confirm against compute_badness().
      const float candidate =
        (badness_now > 0) ? current - 2*M_PI : current + 2*M_PI;

      const float badness_candidate =
        compute_badness(candidate, lower[slot], upper[slot]);

      // Keep the wrapped angle only if it is strictly less bad.
      if (fabs(badness_candidate) < fabs(badness_now)) {
        current = candidate;
      }

    }

  }

}
/// Clamp all three joint angles of one leg into their [lower, upper] limits.
///
/// @param kc      kinematic constants the limits are read from via jl()
/// @param angles  joint angles; any out-of-range entry is overwritten with the
///                limit it violated
/// @param leg     index of the leg whose limits apply
/// @return true if no angle had to be changed, false otherwise
static bool check_limits(const cmuk::KConstants& kc, vec3f& angles, int leg) {

  bool all_within = true;

  for (int joint = 0; joint < 3; ++joint) {

    const float& lower = jl(kc, leg, joint, 0);
    const float& upper = jl(kc, leg, joint, 1);

    // The two tests are deliberately sequential (not else-if): the upper
    // bound is checked against the value left by the lower-bound clamp.
    if (angles[joint] < lower) {
      all_within = false;
      angles[joint] = lower;
    }

    if (angles[joint] > upper) {
      all_within = false;
      angles[joint] = upper;
    }

  }

  return all_within;

}
/// Evaluate T(i, {_j, _l}) / (1 + tJ[i][_l] * M[i][_l] * T(i, {_j, _l})).
///
/// @param i   variable index
/// @param _j  first entry to mark in the neighbour subset of i
/// @param _l  second entry to mark; also indexes tJ[i] and M[i]
Real MR::Omega(size_t i, size_t _j, size_t _l) {
    // Subset sized by the number of neighbours of i, with _j and _l set.
    sub_nb pairSubset(G.nb(i).size());
    pairSubset.set(_j);
    pairSubset.set(_l);

    const Real tPair = T(i, pairSubset);

    return tPair / (1.0 + tJ[i][_l] * M[i][_l] * tPair);
}
/// Calculate the (2l+1)*A_{n,l} coefficients for each Lorentzian std::vector<double> InelasticDiffSphere::LorentzianCoefficients(double a) const { // precompute the 2+m_lmax spherical bessel functions (26 in total) std::vector<double> jl(2 + m_lmax); for (size_t l = 0; l < 2 + m_lmax; l++) { jl[l] = boost::math::sph_bessel(static_cast<unsigned int>(l), a); } // store the coefficient of each Lorentzian in vector YJ(a,w) size_t ncoeff = m_xnl.size(); std::vector<double> YJ(ncoeff); for (size_t i = 0; i < ncoeff; i++) { double x = m_xnl[i].x; unsigned int l = static_cast<unsigned int>(m_xnl[i].l); double J; if (fabs(a - x) > m_divZone) { J = (a * jl[l + 1] - l * jl[l]) / (a * a - x * x); } else { J = m_linearJlist[i].slope * a + m_linearJlist[i].intercept; // linear interpolation instead } YJ[i] = m_alpha[i] * (J * J); } return YJ; } // end of LorentzianCoefficients
/// Copy the lower and upper limits of the three joints of one leg.
///
/// @param leg     leg to query; must lie in [0, NUM_LEGS)
/// @param minrot  receives the three lower limits; must not be null
/// @param maxrot  receives the three upper limits; must not be null
/// @return CMUK_BAD_LEG_INDEX for an out-of-range leg,
///         CMUK_INSUFFICIENT_ARGUMENTS if either output pointer is null,
///         CMUK_OKAY otherwise
CMUK_ERROR_CODE cmuk::getJointLimits( LegIndex leg,
                                      vec3f* minrot,
                                      vec3f* maxrot ) const {

  const int leg_number = static_cast<int>(leg);
  const bool leg_valid = (leg_number >= 0) && (leg_number < NUM_LEGS);

  if (!leg_valid) {
    return CMUK_BAD_LEG_INDEX;
  }

  if (!(minrot && maxrot)) {
    return CMUK_INSUFFICIENT_ARGUMENTS;
  }

  vec3f& lower = *minrot;
  vec3f& upper = *maxrot;

  for (int joint = 0; joint < 3; ++joint) {
    lower[joint] = jl(_kc, leg, joint, 0);
    upper[joint] = jl(_kc, leg, joint, 1);
  }

  return CMUK_OKAY;

}
void SimplexParameters::update(double y, const MnAlgebraicVector& p) { theSimplexParameters[jh()] = std::pair<double, MnAlgebraicVector>(y, p); if(y < theSimplexParameters[jl()].first) theJLow = jh(); unsigned int jh = 0; for(unsigned int i = 1; i < theSimplexParameters.size(); i++) { if(theSimplexParameters[i].first > theSimplexParameters[jh].first) jh = i; } theJHigh = jh; return; }
/// Solve the inverse kinematics of one leg for a desired foot position.
///
/// Up to two candidate hip_rx angles are computed. For each candidate that is
/// evaluated, a "knee bent forward" and a "knee bent rearward" solution is
/// built, wrapped with check_wrap() and clamped with check_limits(). The
/// candidate whose accumulated flags score lowest in howbad() is returned.
///
/// @param leg              leg to solve for; must lie in [0, NUM_LEGS)
/// @param pos              desired foot position
/// @param q_bent_forward   receives (hip_rx, hip_ry, knee_ry) of the forward
///                         solution; must not be null
/// @param q_bent_rearward  receives (hip_rx, hip_ry, knee_ry) of the rearward
///                         solution; must not be null
/// @return CMUK_BAD_LEG_INDEX or CMUK_INSUFFICIENT_ARGUMENTS on bad input,
///         otherwise flags_to_errcode() of the flags of the chosen solution
CMUK_ERROR_CODE cmuk::computeFootIK( LegIndex leg,
                                     const vec3f& pos,
                                     vec3f* q_bent_forward,
                                     vec3f* q_bent_rearward ) const {

  if ((int)leg < 0 || (int)leg >= NUM_LEGS) {
    return CMUK_BAD_LEG_INDEX;
  } else if (!q_bent_forward || !q_bent_rearward) {
    return CMUK_INSUFFICIENT_ARGUMENTS;
  }

  debug << "*** computing IK...\n";

  int hipflags = 0;

  // subtract off hip position
  vec3f p = pos - jo(_kc, leg, HIP_RX_OFFSET, _centeredFootIK);
  vec3f orig = pos;

  // get dist from hip rx joint to y rotation plane
  const float& d = jo(_kc, leg, HIP_RY_OFFSET, _centeredFootIK)[1];

  // get the squared length of the distance on the plane
  float yz = p[1]*p[1] + p[2]*p[2];

  // alpha is the angle of the foot in the YZ plane with respect to the Y axis
  float alpha = atan2(p[2], p[1]);

  // h is the distance of foot from hip in YZ plane
  float h = sqrt(yz);

  // beta is the angle between the foot-hip vector (projected in YZ
  // plane) and the top hip link.
  float cosbeta = d / h;

  debug << "p = " << p << ", d = " << d << ", yz = " << yz
        << "\nalpha = " << alpha << ", h = " << h
        << ", cosbeta=" << cosbeta << "\n";

  if (fabs(cosbeta) > 1) {
    debug << "violated triangle inequality when calculating hip_rx_angle!\n" ;
    // Only flag the target as out of reach when the violation exceeds
    // the numerical tolerance.
    if (fabs(cosbeta) - 1 > 1e-4) {
      hipflags = hipflags | IK_UPPER_DISTANCE;
    }
    cosbeta = (cosbeta < 0) ? -1 : 1;
    if (yz < 1e-4) {
      p[1] = d;
      p[2] = 0;
    } else {
      // Rescale the YZ components so that their length becomes |d|, and
      // move the target used for the lower-leg solve accordingly.
      float scl = fabs(d) / h;
      p[1] *= scl;
      p[2] *= scl;
      orig = p + jo(_kc, leg, HIP_RX_OFFSET, _centeredFootIK);
    }
  }

  float beta = acos(cosbeta);

  // Now compute the two possible hip angles
  float hip_rx_angles[2], badness[2];
  int flags[2];

  flags[0] = hipflags;
  flags[1] = hipflags;

  hip_rx_angles[0] = fix_angle(alpha - beta, -M_PI, M_PI);
  hip_rx_angles[1] = fix_angle(alpha + beta, -M_PI, M_PI);

  const float& min = jl(_kc, leg, HIP_RX, 0);
  const float& max = jl(_kc, leg, HIP_RX, 1);

  // See how badly we violate the joint limits for this hip angles
  for (int i=0; i<2; ++i) {
    float& angle = hip_rx_angles[i];
    badness[i] = fabs(compute_badness(angle, min, max));
    if (badness[i]) {
      flags[i] = flags[i] | IK_UPPER_ANGLE_RANGE;
    }
  }

  // Put the least bad (and smallest) hip angle first
  bool swap = false;

  // The comparison must be strict: with "<=" the case where both candidates
  // are within limits (badness 0 and 0) would always be caught here, making
  // the tie-break below unreachable.
  if ( badness[1] < badness[0] ) {
    // We want the less bad solution for hip angle
    swap = true;
  } else if (badness[0] == 0 && badness[1] == 0) {
    // We want the solution for hip angle that leaves the hip up.
    if ((leg == FL || leg == HL) && hip_rx_angles[0] > hip_rx_angles[1]) {
      swap = true;
    } else if ((leg == FR || leg == HR) && hip_rx_angles[0] < hip_rx_angles[1]) {
      swap = true;
    }
  }

  if (swap) {
    std::swap(hip_rx_angles[0], hip_rx_angles[1]);
    std::swap(badness[0], badness[1]);
    std::swap(flags[0], flags[1]);
  }

  // If the first candidate is within limits and the second is not, the
  // second one is not evaluated at all.
  int hip_solution_cnt = 2;
  if (badness[0] == 0 && badness[1] != 0) {
    hip_solution_cnt = 1;
  }

  debug << "hip_rx_angles[0]=" << hip_rx_angles[0]
        << ", badness=" << badness[0]
        << ", flags=" << flags[0] << "\n";
  debug << "hip_rx_angles[1]=" << hip_rx_angles[1]
        << ", badness=" << badness[1]
        << ", flags=" << flags[1] << "\n";
  debug << "hip_solution_cnt = " << hip_solution_cnt << "\n";

  vec3f qfwd[2], qrear[2];

  for (int i=0; i<hip_solution_cnt; ++i) {

    debug << "** computing ll solution " << (i+1) << " of " << (hip_solution_cnt) << "\n";

    float hip_rx = hip_rx_angles[i];

    // now make inv. transform to get rid of hip rotation
    Transform3f tx = Transform3f::rx(hip_rx, jo(_kc, leg, HIP_RX_OFFSET, _centeredFootIK));
    vec3f ptx = tx.transformInv(orig);

    debug << "tx=[" << tx.translation() << ", " << tx.rotation() << "], ptx = " << ptx << "\n";

    // calculate lengths for cosine law
    float l1sqr = ol2(_kc, leg, KNEE_RY_OFFSET, _centeredFootIK);
    float l2sqr = ol2(_kc, leg, FOOT_OFFSET, _centeredFootIK);
    float l1 = ol(_kc, leg, KNEE_RY_OFFSET, _centeredFootIK);
    float l2 = ol(_kc, leg, FOOT_OFFSET, _centeredFootIK);

    float ksqr = ptx[0]*ptx[0] + ptx[2]*ptx[2];
    float k = sqrt(ksqr);

    debug << "l1=" << l1 << ", l2=" << l2 << ", k=" << k << "\n";

    // check triangle inequality
    if (k > l1 + l2) {
      debug << "oops, violated the triangle inequality for lower segments: "
            << "k = " << k << ", "
            << "l1 + l2 = " << l1 + l2 << "\n";
      if (k - (l1 + l2) > 1e-4) {
        flags[i] = flags[i] | IK_LOWER_DISTANCE;
      }
      // Pull the target back to the fully stretched leg length.
      k = l1 + l2;
      ksqr = k * k;
    }

    // 2*theta is the acute angle formed by the spread
    // of the two hip rotations...
    float costheta = (l1sqr + ksqr - l2sqr) / (2 * l1 * k);
    if (fabs(costheta) > 1) {
      debug << "costheta = " << costheta << " > 1\n";
      if (fabs(costheta) - 1 > 1e-4) {
        flags[i] = flags[i] | IK_LOWER_DISTANCE;
      }
      costheta = (costheta < 0) ? -1 : 1;
    }
    float theta = acos(costheta);

    // gamma is the angle of the foot with respect to the z axis
    float gamma = atan2(-ptx[0], -ptx[2]);

    // hip angles are just offsets off of gamma now
    float hip_ry_1 = gamma - theta;
    float hip_ry_2 = gamma + theta;

    // phi is the obtuse angle of the parallelogram
    float cosphi = (l1sqr + l2sqr - ksqr) / (2 * l1 * l2);
    if (fabs(cosphi) > 1) {
      debug << "cosphi = " << cosphi << " > 1\n";
      if (fabs(cosphi) - 1 > 1e-4) {
        flags[i] = flags[i] | IK_LOWER_DISTANCE;
      }
      cosphi = (cosphi < 0) ? -1 : 1;
    }
    float phi = acos(cosphi);

    // epsilon is the "error" caused by not having feet offset directly
    // along the z-axis (if they were, epsilon would equal zero)
    float epsilon = le(_kc, leg, _centeredFootIK);

    // now we can directly solve for knee angles
    float knee_ry_1 = M_PI - phi - epsilon;
    float knee_ry_2 = -M_PI + phi - epsilon;

    // now fill out angle structs and check limits
    qfwd[i] = vec3f(hip_rx, hip_ry_1, knee_ry_1);
    qrear[i] = vec3f(hip_rx, hip_ry_2, knee_ry_2);

    debug << "before wrap, qfwd = " << qfwd[i] << "\n";
    debug << "before wrap, qrear = " << qrear[i] << "\n";

    check_wrap(_kc, qfwd[i], leg);
    check_wrap(_kc, qrear[i], leg);

    debug << "after wrap, qfwd = " << qfwd[i] << "\n";
    debug << "after wrap, qrear = " << qrear[i] << "\n";

    if (!check_limits(_kc, qfwd[i], leg)) {
      debug << "violated limits forward!\n";
      flags[i] = flags[i] | IK_LOWER_ANGLE_RANGE_FWD;
    }
    if (!check_limits(_kc, qrear[i], leg)) {
      debug << "violated limits rearward!\n";
      flags[i] = flags[i] | IK_LOWER_ANGLE_RANGE_REAR;
    }

  } // for each viable hip solution

  // Pick the candidate whose flags score lowest; the first candidate wins
  // on a tie.
  int best = 0;
  if (hip_solution_cnt == 2) {
    if (howbad(flags[0]) > howbad(flags[1])) {
      best = 1;
    }
    debug << "best overall solution is " << (best+1) << "\n";
  }

  *q_bent_forward = qfwd[best];
  *q_bent_rearward = qrear[best];
  return flags_to_errcode(flags[best]);

}
void forward_avx2() { xor_(reg_soff, reg_soff); Label mb_sp_loop; L(mb_sp_loop); { channel_loop([=](size_t unroll) { // Load 32 channels (two C16_blocks) in ymm, then // split the work in half, each half splits in two // regs with 8 channels per. When down converting, // put the result in a temp register for the 1st // iteration, combine the result at 2nd iteration // and store ymm with 32 channels. // If 16 channels, do just one half and store the // result with mask. Vmm v0 = Vmm(0); Vmm v1 = Vmm(1); Vmm vscale0 = Vmm(2); Vmm vshift0 = Vmm(3); Vmm vmean0 = Vmm(4); Vmm vsqrtvar0 = Vmm(5); Vmm vscale1 = Vmm(6); Vmm vshift1 = Vmm(7); Vmm vmean1 = Vmm(8); Vmm vsqrtvar1 = Vmm(9); Vmm tmp = Vmm(10); for (size_t i = 0; i < unroll; i++) { compute_vscaleshift(vscale0, vshift0, vmean0, vsqrtvar0, i * c_in_xmm_ * sizeof(float)); compute_vscaleshift(vscale1, vshift1, vmean1, vsqrtvar1, i * c_in_xmm_ * sizeof(float) + simd_w_ * sizeof(float)); vpmovsxbd(v0, src_ptr(i*c_in_xmm_)); vpmovsxbd(v1, src_ptr(i*c_in_xmm_ + simd_w_)); vcvtdq2ps(v0, v0); vcvtdq2ps(v1, v1); uni_vfmadd213ps(v0, vscale0, vshift0); uni_vfmadd213ps(v1, vscale1, vshift1); if (with_relu_) { uni_vmaxps(v0, v0, vzero); uni_vmaxps(v1, v1, vzero); } vcvtps2dq(v0, v0); // BA vcvtps2dq(v1, v1); // DC vpackssdw(v0, v0, v1); // BA + DC -> DBCA vpermq(v0, v0, 0xD8); // DBCA -> DCBA vperm2i128(v1, v0, v0, 0x1); // DCBA -> BADC vpacksswb(v0, v0, v1); // DCBA + BADC -> badcDCBA if (i == 0 && unroll != 1) uni_vmovups(tmp, v0); else if (i == 1) { // badcDCBA + fehgHGFE -> HGFEDCBA vperm2i128(v0, v0, tmp, 0x2); } } if (unroll == 1) vmaskmovps(dst_ptr(), vbody_mask, v0); else uni_vmovups(dst_ptr(), v0); }, [=]() { // handle first 8 channels. If tail is bigger, // handle second part separately. There is no way // to get performance as one has to work with bytes // via xmm. vzeroupper kills all the perf. 
Xmm x0 = Xmm(0); Vmm v0 = Vmm(0); Vmm vscale0 = Vmm(1); Vmm vshift0 = Vmm(2); Vmm vmean0 = Vmm(3); Vmm vsqrtvar0 = Vmm(4); size_t tail = nstl::min(c_tail_, simd_w_); size_t num_iters = c_tail_ > simd_w_ ? 2 : 1; for (size_t i = 0; i < num_iters; i++) { if (i > 0) tail = c_tail_ - simd_w_; for (size_t tl = 0; tl < tail; tl++) vpinsrb(x0, x0, src_ptr(8*i + tl), tl); if (tail == simd_w_) compute_vscaleshift(vscale0, vshift0, vmean0, vsqrtvar0, 32*i); else compute_vscaleshift(vscale0, vshift0, vmean0, vsqrtvar0, 32*i, true); vpmovsxbd(v0, x0); vcvtdq2ps(v0, v0); uni_vfmadd213ps(v0, vscale0, vshift0); if (with_relu_) uni_vmaxps(v0, v0, vzero); vcvtps2dq(v0, v0); vpackssdw(v0, v0, vzero); vpermq(v0, v0, 0xD8); vpacksswb(v0, v0, vzero); for (size_t tl = 0; tl < tail; tl++) vpextrb(dst_ptr(8*i + tl), x0, tl); } }); add(reg_soff, reg_coff_max); cmp(reg_soff, reg_soff_max); jl(mb_sp_loop); } }
void forward_avx512() { xor_(reg_soff, reg_soff); Label mb_sp_loop; L(mb_sp_loop); { channel_loop([=](size_t unroll) { // Works with 16c times @unroll blocks simultaneously. // Each block up converts 16c, performs math and down // converts. for (size_t i = 0; i < unroll; i++) { Vmm v = Vmm(i + 0*unroll); Vmm vscale = Vmm(i + 1*unroll); Vmm vshift = Vmm(i + 2*unroll); Vmm vmean = Vmm(i + 3*unroll); Vmm vsqrtvar = Vmm(i + 4*unroll); compute_vscaleshift(vscale, vshift, vmean, vsqrtvar, i * c_in_xmm_ * sizeof(float)); vpmovsxbd(v, src_ptr(i * c_in_xmm_)); vcvtdq2ps(v, v); uni_vfmadd213ps(v, vscale, vshift); if (with_relu_) uni_vmaxps(v, v, vzero); vcvtps2dq(v, v); vpmovsdb(dst_ptr(i * c_in_xmm_), v); } }, [=]() { // There is no way to get performance as one has to // work with bytes via xmm. vzeroupper kills the perf. Xmm x = Xmm(0); Vmm v = Vmm(0); Vmm vscale = Vmm(1); Vmm vshift = Vmm(2); Vmm vmean = Vmm(3); Vmm vsqrtvar = Vmm(4); for (size_t tl = 0; tl < c_tail_; tl++) vpinsrb(x, x, src_ptr(tl), tl); compute_vscaleshift(vscale, vshift, vmean, vsqrtvar, 0, true); vpmovsxbd(v, x); vcvtdq2ps(v, v); uni_vfmadd213ps(v, vscale, vshift); if (with_relu_) uni_vmaxps(v, v, vzero); vcvtps2dq(v, v); vpmovsdb(x, v); for (size_t tl = 0; tl < c_tail_; tl++) vpextrb(dst_ptr(tl), x, tl); }); add(reg_soff, reg_coff_max); cmp(reg_soff, reg_soff_max); jl(mb_sp_loop); } }