void dwf_dslash_5_plus_dag1(Vector *out, Vector *in, Float mass, Dwf *dwf_lib_arg) { #pragma omp parallel default(shared) { // Initializations //------------------------------------------------------------------ int idx; IFloat *f_in; IFloat *f_out; int x; int s; const IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv(); const IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv(); const int local_ls = GJP.SnodeSites(); const int s_nodes = GJP.Snodes(); const int s_node_coor = GJP.SnodeCoor(); const int vol_4d_cb = dwf_lib_arg->vol_4d / 2; const int max_dex((local_ls-1)*vol_4d_cb); const int ls_stride = 24 * vol_4d_cb; IFloat *comm_buf = dwf_lib_arg->comm_buf; // [1 + gamma_5] term (if dag=1 [1 - gamma_5] term) // // out[s] = [1 + gamma_5] in[s-1] //------------------------------------------------------------------ f_in = (IFloat *) in; f_out = (IFloat *) out; //if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; //} f_out = f_out + ls_stride; #pragma omp for schedule(static) for (idx=0;idx<max_dex;idx++) { cblas_daxpy(12,two_over_a5,f_in+24*idx,f_out+24*idx); } // [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. // If the lower boundary is the s=0 term // out[0] = - m_f * [1 + gamma_5] in[ls-1] // else, out[s] = [1 + gamma_5] in[s-1] // //------------------------------------------------------------------ f_in = (IFloat *) in; f_in = f_in + (local_ls-1)*ls_stride; f_out = (IFloat *) out; //if(dag == 1){ f_in = f_in + 12; f_out = f_out + 12; //} #pragma omp for schedule(static) for(x=0; x<vol_4d_cb; x++) { int shift(24*x); IFloat *f_temp(f_in+shift); if (s_nodes != 1 ) { f_temp = comm_buf; getMinusData(f_temp, f_in+shift, 12, 4); } if(s_node_coor == 0) { cblas_daxpy(12,neg_mass_two_over_a5,f_temp,f_out+shift); } else { cblas_daxpy(12,two_over_a5,f_temp,f_out+shift); } } // [1 - gamma_5] term (if dag=1 [1 + gamma_5] term) // // out[s] = [1 - gamma_5] in[s+1] //------------------------------------------------------------------ f_in = (IFloat *) in; f_out = (IFloat *) out; #if 0 if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } #endif f_in = f_in + ls_stride; #pragma omp for schedule(static) for (idx=0;idx<max_dex;idx++) { const int shift(24*idx); cblas_daxpy(12,two_over_a5,f_in+shift,f_out+shift); } // [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term) // If there's only one node along fifth direction, no communication // is necessary; Otherwise data from adjacent node in minus direction // will be needed. // If the upper boundary is the s=ls term // out[ls-1] = - m_f * [1 - gamma_5] in[0] // else out[s] = [1 - gamma_5] in[s+1] // //------------------------------------------------------------------ f_in = (IFloat *) in; f_out = (IFloat *) out; #if 0 if(dag == 0){ f_in = f_in + 12; f_out = f_out + 12; } #endif f_out = f_out + (local_ls-1)*ls_stride; #pragma omp for schedule(static) for(x=0; x<vol_4d_cb; x++){ const int shift(24*x); IFloat *f_temp (f_in+shift); if (s_nodes != 1 ) { f_temp = comm_buf; getPlusData(f_temp, f_in+shift, 12, 4); } if(s_node_coor == s_nodes - 1) { cblas_daxpy(12,neg_mass_two_over_a5,f_temp,f_out+shift); } else { cblas_daxpy(12,two_over_a5,f_temp,f_out+shift); } } } // omp parallel const int local_ls = GJP.SnodeSites(); const int vol_4d_cb = dwf_lib_arg->vol_4d / 2; DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12; }
void pSubChain::calc_acc_body() { // if(!children[1]) return; #ifdef USE_MPI if(sim->rank != rank) return; if(children[0] && sim->rank != children[0]->rank) { children[0]->recv_acc(); } if(children[1] && children[0] != children[1] && sim->rank != children[1]->rank) { children[1]->recv_acc(); } #endif // update_log << "--- " << last_joint->name << ": calc_acc_body" << endl; int i, j; // compute f_temp static fVec da; static fMat PK; PK.resize(6, n_dof); da.resize(n_const); for(i=0; i<6; i++) { for(j=0; j<n_dof; j++) PK(i, j) = P(i, joint_index[j]); } if(last_joint->n_dof > 0) { switch(last_joint->j_type) { case JROTATE: case JSLIDE: tau(0) = last_joint->tau; break; case JSPHERE: tau(0) = last_joint->tau_n(0); tau(1) = last_joint->tau_n(1); tau(2) = last_joint->tau_n(2); break; case JFREE: tau(0) = last_joint->tau_f(0); tau(1) = last_joint->tau_f(1); tau(2) = last_joint->tau_f(2); tau(3) = last_joint->tau_n(0); tau(4) = last_joint->tau_n(1); tau(5) = last_joint->tau_n(2); break; default: break; } da6.mul(PK, tau); } else da6.zero(); // cerr << "da6(0) = " << tran(da6) << endl; // + child_side - parent_side ? da6 += children[0]->acc_temp[last_index[0]]; // cerr << "da6(1) = " << tran(da6) << endl; if(children[1]) da6 -= children[1]->acc_temp[last_index[1]]; // cerr << "da6(2) = " << tran(da6) << endl; // motion controlled joints if(!last_joint->t_given) { switch(last_joint->j_type) { case JROTATE: case JSLIDE: da6(axis) -= last_joint->qdd; // update_log << last_joint->name << ": qdd = " << last_joint->qdd << endl; break; case JSPHERE: da6(3) -= last_joint->rel_ang_acc(0); da6(4) -= last_joint->rel_ang_acc(1); da6(5) -= last_joint->rel_ang_acc(2); break; case JFREE: da6(0) -= last_joint->rel_lin_acc(0); da6(1) -= last_joint->rel_lin_acc(1); da6(2) -= last_joint->rel_lin_acc(2); da6(3) -= last_joint->rel_ang_acc(0); da6(4) -= last_joint->rel_ang_acc(1); da6(5) -= last_joint->rel_ang_acc(2); break; } } static fVec f(6); // cerr << "Gamma = " << Gamma << endl; // cerr << "Gamma_inv = " << Gamma_inv << endl; #if 0 // actually we could save some computation by // selecting const rows first for(i=0; i<n_const; i++) da(i) = -da6(const_index[i]); f_temp.mul(Gamma_inv, da); // f_temp.lineq_posv(Gamma, da); // compute acc at all outer joints for(i=0; i<n_dof; i++) f(joint_index[i]) = tau(i); for(i=0; i<n_const; i++) f(const_index[i]) = f_temp(i); // cerr << "da = " << tran(da) << endl; // cerr << "f_temp = " << tran(f_temp) << endl; // cerr << "Gamma*f_temp - da = " << tran(Gamma*f_temp-da) << endl; #else #if 0 f.mul(W, da6); for(i=0; i<n_dof; i++) { f(joint_index[i]) += tau(i); } #else static fVec db(6), Wdb(6), IWRtau(6); static fMat IWR; IWR.resize(6, n_dof); db.set(children[0]->acc_temp[last_index[0]]); if(children[1]) db -= children[1]->acc_temp[last_index[1]]; Wdb.mul(W, db); for(i=0; i<6; i++) { for(j=0; j<n_dof; j++) { IWR(i, j) = IW(joint_index[j], i); } } IWRtau.mul(IWR, tau); // cerr << "W = " << tran(W) << endl; // update_log << "db = " << tran(db) << endl; // cerr << "Wdb = " << tran(Wdb) << endl; // cerr << "IWRtau = " << tran(IWRtau) << endl; f.add(Wdb, IWRtau); // update_log << "f = " << tran(f) << endl; #ifdef PSIM_TEST ////// -> test for(i=0; i<n_const; i++) { da(i) = -da6(const_index[i]); f_temp(i) = f(const_index[i]); } // cerr << "Gamma*f_temp - da = " << tran(Gamma*f_temp-da) << endl; total_gamma_error += (Gamma*f_temp-da) * (Gamma*f_temp-da); ////// <- #endif #endif #endif for(i=0; i<n_outer_joints; i++) { int org = outer_joints_origin[i]; int index = outer_joints_index[i]; int ilast = last_index[org]; acc_temp[i].mul(children[org]->Lambda[index][ilast], f); if(org == 1) { acc_temp[i] *= -1.0; } acc_temp[i] += children[org]->acc_temp[index]; // update_log << "acc_temp[" << i << "] = " << tran(acc_temp[i]) << endl; } #ifdef USE_MPI if(parent && sim->rank != parent->rank) { send_acc(parent->rank); } #endif }