示例#1
0
static int
q_dirac_solver(lua_State *L)
{
    CloverSolver *solver = lua_touserdata(L, lua_upvalueindex(1));
    mClover *c = qlua_checkClover(L, lua_upvalueindex(2), NULL, 1);
    mLattice *S = qlua_ObjLattice(L, lua_upvalueindex(2));
    int Sidx = lua_gettop(L);
    int relaxed_p;
    long long fl1;
    double t1;
    double out_eps;
    int out_iters;
    int log_level;
    
    switch (lua_type(L, 2)) {
    case LUA_TNONE:
    case LUA_TNIL:
        relaxed_p = 0;
        break;
    case LUA_TBOOLEAN:
        relaxed_p = lua_toboolean(L, 2);
        break;
    default:
        relaxed_p = 1;
        break;
    }

    if ((lua_type(L, 3) == LUA_TBOOLEAN) && (lua_toboolean(L, 3) != 0))
        log_level = (QOP_CLOVER_LOG_CG_RESIDUAL |
                     QOP_CLOVER_LOG_EIG_POSTAMBLE |
                     QOP_CLOVER_LOG_EIG_UPDATE1);
    else
        log_level = 0;

    switch (qlua_qtype(L, 1)) {
    case qLatDirFerm3: {
        mLatDirFerm3 *psi = qlua_checkLatDirFerm3(L, 1, S, 3);
        mLatDirFerm3 *eta = qlua_newZeroLatDirFerm3(L, Sidx, 3);
        struct QOP_CLOVER_Fermion *c_psi;
        struct QOP_CLOVER_Fermion *c_eta;
        CL_D_env env;
        QLA_D_Real rhs_norm2 = 0;
        double rhs_n;
        int status;

        CALL_QDP(L);
        QDP_D3_r_eq_norm2_D(&rhs_norm2, psi->ptr, S->all);
        if (rhs_norm2 == 0) {
            lua_pushnumber(L, 0.0);
            lua_pushnumber(L, 0);
            lua_pushnumber(L, 0);
            lua_pushnumber(L, 0);
            return 5;
        }
        rhs_n = sqrt(rhs_norm2);
        env.lat = S->lat;
        env.f = QDP_D3_expose_D(psi->ptr);
        env.s = 1 / rhs_n;
        if (QOP_CLOVER_import_fermion(&c_psi, c->state, q_CL_D_reader_scaled,
                                      &env))
            return luaL_error(L, "CLOVER_import_fermion() failed");
        QDP_D3_reset_D(psi->ptr);

        if (QOP_CLOVER_allocate_fermion(&c_eta, c->state))
            return luaL_error(L, "CLOVER_allocate_fermion() failed");

        status = solver->proc(L, c_eta, &out_iters, &out_eps, c_psi, log_level);

        QOP_CLOVER_performance(&t1, &fl1, NULL, NULL, c->state);
        if (t1 == 0)
            t1 = -1;
        if (QDP_this_node == qlua_master_node)
            printf("CLOVER %s solver: status = %d,"
                   " eps = %.4e, iters = %d, time = %.3f sec,"
                   " perf = %.2f MFlops/sec\n",
                   solver->name, status,
                   out_eps, out_iters, t1, fl1 * 1e-6 / t1);

        env.lat = S->lat;
        env.f = QDP_D3_expose_D(eta->ptr);
        env.s = rhs_n;
        QOP_CLOVER_export_fermion(q_CL_D_writer_scaled, &env, c_eta);
        QDP_D3_reset_D(eta->ptr);
        
        QOP_CLOVER_free_fermion(&c_eta);
        QOP_CLOVER_free_fermion(&c_psi);

        if (status) {
            if (relaxed_p)
                return 0;
            else
                return luaL_error(L, QOP_CLOVER_error(c->state));
        }

        /* eta is on the stack already */
        lua_pushnumber(L, out_eps);
        lua_pushnumber(L, out_iters);
        lua_pushnumber(L, t1);
        lua_pushnumber(L, (double)fl1);

        return 5;
    }

    case qLatDirProp3: {
        mLatDirProp3 *psi = qlua_checkLatDirProp3(L, 1, S, 3);
        mLatDirProp3 *eta = qlua_newZeroLatDirProp3(L, Sidx, 3);
        struct QOP_CLOVER_Fermion *c_psi;
        struct QOP_CLOVER_Fermion *c_eta;
        int gstatus = 0;
        int status;
        qCL_P_env env;
        QLA_D_Real rhs_norm2 = 0;
        double rhs_n;

        lua_createtable(L, QOP_CLOVER_COLORS, 0);  /* eps */
        lua_createtable(L, QOP_CLOVER_COLORS, 0);  /* iters */
        CALL_QDP(L);
        if (QOP_CLOVER_allocate_fermion(&c_eta, c->state))
            return luaL_error(L, "CLOVER_allocate_fermion() failed");

        QDP_D3_r_eq_norm2_P(&rhs_norm2, psi->ptr, S->all);
        if (rhs_norm2 == 0) {
            return 3;
        }
        rhs_n = sqrt(rhs_norm2);
        env.lat = S->lat;
        env.in = QDP_D3_expose_P(psi->ptr);
        env.out = QDP_D3_expose_P(eta->ptr);
        for (env.c = 0; env.c < QOP_CLOVER_COLORS; env.c++) {
            lua_createtable(L, QOP_CLOVER_FERMION_DIM, 0); /* eps.c */
            lua_createtable(L, QOP_CLOVER_FERMION_DIM, 0); /* iters.c */

            for (env.d = 0; env.d < QOP_CLOVER_FERMION_DIM; env.d++) {
                env.s = 1 / rhs_n;
                if (QOP_CLOVER_import_fermion(&c_psi, c->state,
                                              q_CL_P_reader_scaled, &env))
                    return luaL_error(L, "CLOVER_import_fermion() failed");
                status = solver->proc(L, c_eta, &out_iters, &out_eps, c_psi,
                                      log_level);

                QOP_CLOVER_performance(&t1, &fl1, NULL, NULL, c->state);
                if (t1 == 0)
                    t1 = -1;
                if (QDP_this_node == qlua_master_node)
                    printf("CLOVER %s solver: status = %d, c = %d, d = %d,"
                           " eps = %.4e, iters = %d, time = %.3f sec,"
                           " perf = %.2f MFlops/sec\n",
                           solver->name, status,
                           env.c, env.d, out_eps, out_iters, t1,
                           fl1 * 1e-6 / t1);
                QOP_CLOVER_free_fermion(&c_psi);
                if (status) {
                    if (relaxed_p)
                        gstatus = 1;
                    else
                        return luaL_error(L, QOP_CLOVER_error(c->state));
                }

                env.s = rhs_n;
                QOP_CLOVER_export_fermion(q_CL_P_writer_scaled, &env, c_eta);
                lua_pushnumber(L, out_eps);
                lua_rawseti(L, -3, env.d + 1);
                lua_pushnumber(L, out_iters);
                lua_rawseti(L, -2, env.d + 1);
            }
            lua_rawseti(L, -3, env.c + 1);
            lua_rawseti(L, -3, env.c + 1);
        }
        QDP_D3_reset_P(psi->ptr);
        QDP_D3_reset_P(eta->ptr);
        QOP_CLOVER_free_fermion(&c_eta);

        if (gstatus)
            return 0;
        else
            return 3;
    }
    default:
        break;
    }
    return luaL_error(L, "bad argument to CLOVER solver");
}
示例#2
0
int 
bicgilu_cl_qop_single_for_double( int prop_type,
				  QOP_FermionLinksWilson *qop_links, 
				  quark_invert_control *qic, int milc_parity,
				  void *dmps[],
				  float *kappas[], int nkappa[], 
				  QOP_DiracFermion **qop_sol[], 
				  QOP_DiracFermion *qop_src[], 
				  int nsrc,		    
				  int *final_restart,
				  Real *final_rsq_ptr )
{
  int i, iters, iters_F = 0;
  int converged;
  int nrestart;
  int max_restarts = qic->nrestart;
  int isrc, ikappa;
  int final_restart_F;
  Real final_rsq_F, final_relrsq_F;
  Real resid_F = 3e-7;   /* The limits of a single precision inversion */
  Real rel_F = 0;   /* The limits of a single precision inversion */
  QOP_invert_arg_t qop_invert_arg;
  QOP_resid_arg_t  ***qop_resid_arg_F;
  QOP_info_t info_F = {0., 0., 0, 0, 0}, info = {0., 0., 0, 0, 0};
  QDP_Subset subset = milc2qdp_subset(milc_parity);
  QOP_F3_FermionLinksWilson *qop_links_F;
  QOP_F3_DiracFermion **qop_sol_F[MAXSRC], *qop_rhs_F[MAXSRC];
  QDP_F3_DiracFermion *qdp_rhs_F[MAXSRC];
  QDP_D3_DiracFermion *qdp_src[MAXSRC], *qdp_resid[MAXSRC];
  QDP_D3_DiracFermion *qdp_sol;
  Real relresid2[MAXSRC];
  Real resid2[MAXSRC];
  QLA_D_Real norm2_src[MAXSRC], norm2_resid[MAXSRC], norm_resid[MAXSRC], scale_resid;
  char myname[] = "bicgilu_cl_qop_single_for_double";
  
  /* Only one kappa allowed per source for this algorithm */
  for(i = 0; i < nsrc; i++){
    if(nkappa[i] > 1){
      printf("%s: nkappa[%d] = %d != 1\n",myname,i,nkappa[i]);
      terminate(1);
    }
  }
  
  /* Set qop_invert_arg */
  /* We don't do restarts for the single precision step */
  /* We interpret "qic->nrestart" to mean the max number of calls to
     the single-precision inverter */
  set_qop_invert_arg_norestart( & qop_invert_arg, qic, milc_parity );
  
  /* Pointers for residual errors */
  /* For now we set the residual to something sensible for single precision */
  qop_resid_arg_F = create_qop_resid_arg( nsrc, nkappa, resid_F*resid_F, rel_F*rel_F);

  /* Create a single precision copy of the links object */
  qop_links_F = QOP_FD3_wilson_create_L_from_L( qop_links );

  /* Take norm of source and create temporaries */

  for(i = 0; i < nsrc; i++){
    qdp_src[i] = QOP_D3_convert_D_to_qdp( qop_src[i] );
    QDP_D3_r_eq_norm2_D( norm2_src+i, qdp_src[i], subset );
    qdp_resid[i] = QDP_D3_create_D();
    qdp_rhs_F[i] = QDP_F3_create_D();
    qop_sol_F[i] = (QOP_F3_DiracFermion **)malloc(sizeof(QOP_F3_DiracFermion *));
  }


  /* Main loop */

  nrestart = 0;
  converged = 0;
  iters = 0;

  info.final_sec = -dclock();
  info.final_flop = 0;
  info.status = QOP_SUCCESS;

  while(1){
    /* Create new residual vectors from the result */
    /* r = src - A sol */
    compute_qdp_residuals( prop_type, qdp_resid, qdp_src, 
			   qop_links, qop_sol, dmps, kappas, 
			   nkappa, nsrc, milc_parity );

    /* Compute two different norms */
    qic->final_rsq = 0;
    qic->final_relrsq = 0;
    for(i = 0; i < nsrc; i++){
      qdp_sol = QOP_convert_D_to_qdp( qop_sol[i][0] );
      relresid2[i] = qdp_relative_residue( qdp_resid[i], qdp_sol, subset );
      qop_sol[i][0] = QOP_convert_D_from_qdp( qdp_sol );
      qic->final_relrsq = (relresid2[i] > qic->final_relrsq) ? relresid2[i] : qic->final_relrsq;

      QDP_D3_r_eq_norm2_D( norm2_resid+i, qdp_resid[i], subset );
      resid2[i] = norm2_resid[i]/norm2_src[i];
      qic->final_rsq = (resid2[i] > qic->final_rsq) ? resid2[i] : qic->final_rsq;
#ifdef CG_DEBUG
      node0_printf("%s: double precision restart %d resid2 = %.2e vs %.2e relresid2 = %.2e vs %.2e\n",
		   myname, nrestart, resid2[i], qic->resid * qic->resid, relresid2[i], 
		   qic->relresid * qic->relresid );
#endif
    }
    *final_rsq_ptr = qic->final_rsq;  /* Use Cartesian norm for now */
    *final_restart = nrestart;

    /* Stop when converged */
    converged = 1;
    for(i = 0; i < nsrc; i++){
      if((qic->resid > 0 && resid2[i] > qic->resid * qic->resid) || 
	 (qic->relresid > 0 && relresid2[i] > qic->relresid * qic->relresid)){
	converged = 0;
	break;
      }
    }

    if(converged || nrestart++>=max_restarts)break;

    for(i = 0; i < nsrc; i++){
      /* Scale the RHS to avoid underflow */
      norm_resid[i] = sqrt(norm2_resid[i]);
      scale_resid = 1./norm_resid[i];
      QDP_D3_D_eq_r_times_D(qdp_resid[i], &scale_resid, qdp_resid[i], subset);
      /* Scaled residual becomes the new source */
      QDP_FD3_D_eq_D( qdp_rhs_F[i], qdp_resid[i],  subset);
      qop_rhs_F[i] = QOP_F3_convert_D_from_qdp( qdp_rhs_F[i]);
      /* Prepare to solve in single precision by creating a single
	 precision copy of the source.  Set the trial solution to zero. */
      qop_sol_F[i][0] = create_qop_DiracFermion_F();
    }


    /* Solve in single precision */
    double dtime = -dclock();
    info_F.final_flop = 0.;
    bicgilu_cl_qop_generic_F( prop_type, &info_F, qop_links_F, 
	  &qop_invert_arg, qop_resid_arg_F, dmps, nkappa, qop_sol_F, 
  	  qop_rhs_F, nsrc);
    dtime += dclock();

    /* Report performance statistics */
    
    /* For now we return the largest value and total iterations */
    final_rsq_F = 0;
    final_relrsq_F = 0;
    final_restart_F = 0;
    iters_F = 0;
    for(isrc = 0; isrc < nsrc; isrc++)
      for(ikappa = 0; ikappa < nkappa[isrc]; ikappa++){
	/* QOP routines return the ratios of the squared norms */
	final_rsq_F =    MAX(final_rsq_F, qop_resid_arg_F[isrc][ikappa]->final_rsq);
	final_relrsq_F = MAX(final_relrsq_F, qop_resid_arg_F[isrc][ikappa]->final_rel);
	final_restart_F =    MAX(final_restart_F,  qop_resid_arg_F[isrc][ikappa]->final_restart);
	iters_F += qop_resid_arg_F[isrc][ikappa]->final_iter;
	if(nsrc > 1 || nkappa[isrc] > 1)
	  node0_printf("BICG(src %d,kappa %d): iters = %d resid = %e relresid = %e\n",
		       isrc, ikappa,
		       qop_resid_arg_F[isrc][ikappa]->final_iter,
		       sqrt(qop_resid_arg_F[isrc][ikappa]->final_rsq),
		       sqrt(qop_resid_arg_F[isrc][ikappa]->final_rel));
      }
    
#ifdef CGTIME
    node0_printf("%s: single precision iters = %d status %d final_rsq %.2e wanted %2e final_rel %.2e wanted %.2e\n",
		 myname, iters_F, info_F.status, final_rsq_F, resid_F * resid_F, final_relrsq_F, rel_F);
    node0_printf("time = %g flops = %e mflops = %g\n", dtime, info_F.final_flop, 
		 info_F.final_flop/(1.0e6*dtime) );
    fflush(stdout);
#endif

    /* Add single-precision result to double precision solution (with rescaling) */
    update_qop_solution( qop_sol, norm_resid, qop_sol_F, nsrc, subset );

    for(i = 0; i < nsrc; i++){
      QOP_F3_destroy_D(qop_sol_F[i][0]);
      /* Convert back */
      qdp_rhs_F[i] = QOP_F3_convert_D_to_qdp(qop_rhs_F[i]);
    }

    info.final_flop += info_F.final_flop;
    iters += iters_F;
  }

  /* Clean up */

  for(i = 0; i < nsrc; i++){
    QDP_F3_destroy_D( qdp_rhs_F[i] );
    QDP_D3_destroy_D( qdp_resid[i] );
    /* Must restore qop_src in case the caller reuses it */
    qop_src[i] = QOP_D3_convert_D_from_qdp( qdp_src[i] );
    free(qop_sol_F[i]);
  }

  QOP_F3_wilson_destroy_L( qop_links_F );
  destroy_qop_resid_arg(qop_resid_arg_F, nsrc, nkappa);
  qop_resid_arg_F = NULL;

  if(!converged){
    node0_printf("%s: NOT Converged after %d iters and %d restarts\n",
		 myname, iters, nrestart);
  }

  info.final_sec += dclock();
#ifdef CGTIME
  node0_printf("CGTIME: time = %e (wilson_qop FD) ", info.final_sec);
  for(isrc = 0; isrc < nsrc; isrc++)
    node0_printf("nkappa[%d] = %d tot_iters = %d ",
		 isrc,nkappa[isrc],iters);
  node0_printf("mflops = %e\n", info.final_flop/(1.0e6*info.final_sec) );
  fflush(stdout);
#endif

  return iters;
}