// Fifth-dimension ("s" direction) hopping term, dag = 1 specialisation
// (the "if(dag == 1)" branches of the generic routine are taken
// unconditionally here).
//
// Layout as used below: every 4-d checkerboard site holds 24 IFloats, one
// s-slice holds vol_4d_cb sites (ls_stride = 24 * vol_4d_cb IFloats), and
// each update touches one 12-IFloat half of a site:
//   * second half (offset 12):  out[s] += c * in[s-1]
//   * first  half (offset  0):  out[s] += c * in[s+1]
// with c = 2 * DwfA5Inv in the bulk and across node-internal boundaries,
// and c = -2 * mass * DwfA5Inv where the hop wraps around the global fifth
// dimension (s_node_coor == 0 for the s-1 hop, s_node_coor == s_nodes-1
// for the s+1 hop).
//
// NOTE(review): cblas_daxpy is the project's 4-argument variant; it is
// assumed to compute y += alpha * x over n IFloats -- confirm.
//
// Parameters:
//   out         - destination field; only accumulated into, never cleared.
//   in          - source field; read only.
//   mass        - enters only through the wrap-around coefficient.
//   dwf_lib_arg - supplies vol_4d and the receive buffer comm_buf.
//
// Threading: the bulk loops and the single-node boundary loops are shared
// between the OpenMP threads.  When the fifth dimension is split over
// several nodes (s_nodes != 1) the boundary sites are processed by the
// master thread only, in site order: there is a single comm_buf, so
// concurrent iterations would overwrite each other's received data, and
// the per-site exchanges must be issued in the same order on every node.
void dwf_dslash_5_plus_dag1(Vector *out, 
		       Vector *in, 
		       Float mass,
		       Dwf *dwf_lib_arg)
{
  // Loop-invariant constants, evaluated once and shared read-only by all
  // threads.
  const IFloat two_over_a5 = 2.0 * GJP.DwfA5Inv();
  const IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv();
  const int local_ls    = GJP.SnodeSites();
  const int s_nodes     = GJP.Snodes();
  const int s_node_coor = GJP.SnodeCoor();
  const int vol_4d_cb   = dwf_lib_arg->vol_4d / 2;
  const int max_dex     = (local_ls-1)*vol_4d_cb;
  const int ls_stride   = 24 * vol_4d_cb;

  IFloat * const comm_buf = dwf_lib_arg->comm_buf;
  IFloat * const in_base  = (IFloat *) in;
  IFloat * const out_base = (IFloat *) out;

  // Boundary coefficients: the mass term appears only on the node that
  // holds the corresponding end of the global fifth dimension.
  const IFloat lower_coef =
    (s_node_coor == 0) ? neg_mass_two_over_a5 : two_over_a5;
  const IFloat upper_coef =
    (s_node_coor == s_nodes - 1) ? neg_mass_two_over_a5 : two_over_a5;

#pragma omp parallel default(shared)
  {
    // Second half of each site, bulk:
    //   out[s] += two_over_a5 * in[s-1]   for local s = 1 .. local_ls-1
    //------------------------------------------------------------------
    {
      IFloat * const src = in_base  + 12;
      IFloat * const dst = out_base + 12 + ls_stride;

#pragma omp for schedule(static)
      for (int idx = 0; idx < max_dex; idx++)
	{
	  cblas_daxpy(12, two_over_a5, src + 24*idx, dst + 24*idx);
	}
    }

    // Second half of each site, lower boundary (local s = 0).
    // The source is the last local slice; with more than one node along
    // s it is exchanged through getMinusData into comm_buf first.
    //------------------------------------------------------------------
    {
      IFloat * const src = in_base  + 12 + (local_ls-1)*ls_stride;
      IFloat * const dst = out_base + 12;

      if (s_nodes == 1) {
#pragma omp for schedule(static)
	for (int x = 0; x < vol_4d_cb; x++)
	  {
	    cblas_daxpy(12, lower_coef, src + 24*x, dst + 24*x);
	  }
      }
      else {
	// One shared receive buffer and ordered exchanges: serialise on
	// the master thread (see the threading note above).
#pragma omp master
	{
	  for (int x = 0; x < vol_4d_cb; x++)
	    {
	      getMinusData(comm_buf, src + 24*x, 12, 4);
	      cblas_daxpy(12, lower_coef, comm_buf, dst + 24*x);
	    }
	}
	// "master" has no implied barrier; keep the same phase structure
	// as the worksharing loop it replaces.
#pragma omp barrier
      }
    }

    // First half of each site, bulk:
    //   out[s] += two_over_a5 * in[s+1]   for local s = 0 .. local_ls-2
    //------------------------------------------------------------------
    {
      IFloat * const src = in_base + ls_stride;
      IFloat * const dst = out_base;

#pragma omp for schedule(static)
      for (int idx = 0; idx < max_dex; idx++)
	{
	  cblas_daxpy(12, two_over_a5, src + 24*idx, dst + 24*idx);
	}
    }

    // First half of each site, upper boundary (local s = local_ls-1).
    // The source is the first local slice; with more than one node along
    // s it is exchanged through getPlusData into comm_buf first.
    //------------------------------------------------------------------
    {
      IFloat * const src = in_base;
      IFloat * const dst = out_base + (local_ls-1)*ls_stride;

      if (s_nodes == 1) {
#pragma omp for schedule(static)
	for (int x = 0; x < vol_4d_cb; x++)
	  {
	    cblas_daxpy(12, upper_coef, src + 24*x, dst + 24*x);
	  }
      }
      else {
#pragma omp master
	{
	  for (int x = 0; x < vol_4d_cb; x++)
	    {
	      getPlusData(comm_buf, src + 24*x, 12, 4);
	      cblas_daxpy(12, upper_coef, comm_buf, dst + 24*x);
	    }
	}
	// No explicit barrier needed: the end of the parallel region
	// synchronises all threads.
      }
    }
  } // omp parallel

  // Flop accounting for the four accumulation passes above.
  DiracOp::CGflops+=2*2*vol_4d_cb*local_ls*12;
}
Beispiel #2
0
// Computes the force vector f at this sub-chain's last joint and from it
// the accelerations acc_temp[] of all outer joints.
//
// Steps, in order:
//   1. (USE_MPI) return early unless this process owns the sub-chain;
//      otherwise receive accelerations from children owned elsewhere.
//   2. Load the last joint's torques/forces into tau and form
//      da6 = PK * tau + (child 0 accel) - (child 1 accel), then subtract
//      the given joint acceleration for motion controlled joints.
//   3. f = W * db + IWR * tau, where db is the child acceleration
//      difference and IWR holds the joint_index rows of IW, transposed.
//   4. acc_temp[i] = +/- Lambda * f + child acceleration for every outer
//      joint (negated for joints that come from child 1).
//   5. (USE_MPI) send the result to the parent if it lives on another rank.
//
// NOTE(review): children[0] is dereferenced without a null check; the
// caller presumably guarantees it is set -- confirm.
void pSubChain::calc_acc_body()
{
#ifdef USE_MPI
	if(sim->rank != rank)
		return;
	// Child 1 is skipped when it is the same object as child 0.
	for(int side=0; side<2; side++)
	{
		if(!children[side])
			continue;
		if(side == 1 && children[0] == children[1])
			continue;
		if(sim->rank != children[side]->rank)
			children[side]->recv_acc();
	}
#endif
	// Function-local statics are reused across calls and resized on entry.
	static fVec da;
	static fMat PK;
	PK.resize(6, n_dof);
	da.resize(n_const);
	// PK: the joint_index columns of P.
	for(int row=0; row<6; row++)
	{
		for(int col=0; col<n_dof; col++)
		{
			PK(row, col) = P(row, joint_index[col]);
		}
	}
	if(last_joint->n_dof > 0)
	{
		// Gather the joint torque/force into tau according to joint type.
		switch(last_joint->j_type)
		{
		case JROTATE:
		case JSLIDE:
			tau(0) = last_joint->tau;
			break;
		case JSPHERE:
			for(int k=0; k<3; k++)
				tau(k) = last_joint->tau_n(k);
			break;
		case JFREE:
			for(int k=0; k<3; k++)
				tau(k) = last_joint->tau_f(k);
			for(int k=0; k<3; k++)
				tau(k+3) = last_joint->tau_n(k);
			break;
		default:
			break;
		}
		da6.mul(PK, tau);
	}
	else
	{
		da6.zero();
	}
	// + child side, - parent side
	da6 += children[0]->acc_temp[last_index[0]];
	if(children[1])
	{
		da6 -= children[1]->acc_temp[last_index[1]];
	}
	// Motion controlled joints: remove the given joint acceleration.
	if(!last_joint->t_given)
	{
		switch(last_joint->j_type)
		{
		case JROTATE:
		case JSLIDE:
			da6(axis) -= last_joint->qdd;
			break;
		case JSPHERE:
			for(int k=0; k<3; k++)
				da6(k+3) -= last_joint->rel_ang_acc(k);
			break;
		case JFREE:
			for(int k=0; k<3; k++)
				da6(k) -= last_joint->rel_lin_acc(k);
			for(int k=0; k<3; k++)
				da6(k+3) -= last_joint->rel_ang_acc(k);
			break;
		default:
			break;
		}
	}
	static fVec f(6);
	static fVec db(6), Wdb(6), IWRtau(6);
	static fMat IWR;
	IWR.resize(6, n_dof);
	// db: acceleration difference between the two children at the last joint.
	db.set(children[0]->acc_temp[last_index[0]]);
	if(children[1])
	{
		db -= children[1]->acc_temp[last_index[1]];
	}
	Wdb.mul(W, db);
	// IWR: the joint_index rows of IW, transposed.
	for(int row=0; row<6; row++)
	{
		for(int col=0; col<n_dof; col++)
		{
			IWR(row, col) = IW(joint_index[col], row);
		}
	}
	IWRtau.mul(IWR, tau);
	f.add(Wdb, IWRtau);

#ifdef PSIM_TEST
	// Consistency check: accumulate |Gamma * f_const - da|^2.
	for(int c=0; c<n_const; c++)
	{
		da(c) = -da6(const_index[c]);
		f_temp(c) = f(const_index[c]);
	}
	total_gamma_error += (Gamma*f_temp-da) * (Gamma*f_temp-da);
#endif
	// Propagate f to every outer joint of this sub-chain.
	for(int outer=0; outer<n_outer_joints; outer++)
	{
		const int side = outer_joints_origin[outer];
		const int child_joint = outer_joints_index[outer];
		const int child_last = last_index[side];
		acc_temp[outer].mul(children[side]->Lambda[child_joint][child_last], f);
		// The force acts with opposite sign on the child-1 side.
		if(side == 1)
		{
			acc_temp[outer] *= -1.0;
		}
		acc_temp[outer] += children[side]->acc_temp[child_joint];
	}
#ifdef USE_MPI
	if(parent && sim->rank != parent->rank)
	{
		send_acc(parent->rank);
	}
#endif
}