Exemple #1
0
/*
 * Display a glue node.
 *
 * Nodes whose subtype is A_LEADERS or above are handed to show_leaders().
 * Any other node is shown as the "glue" control-sequence name, followed by a
 * glue-type tag when the subtype is not NORMAL and, unless the subtype is
 * COND_MATH_GLUE, a space and the glue specification.
 */
void tex::show_glue(ptr p)
{
	/* Leader subtypes have a dedicated display routine. */
	if (tex::subtype(p) >= A_LEADERS) {
		show_leaders(p);
		return;
	}

	print_esc("glue");
	if (tex::subtype(p) != NORMAL) {
		show_glue_type(tex::subtype(p));
	}

	/* Nothing more is printed for COND_MATH_GLUE. */
	if (tex::subtype(p) == COND_MATH_GLUE) {
		return;
	}

	print(" ");
	/* Subtypes above COND_MATH_GLUE get the "mu" unit suffix. */
	if (tex::subtype(p) > COND_MATH_GLUE) {
		print_spec(glue_ptr(p), "mu");
	} else {
		print_spec(glue_ptr(p), null_str);
	}
}
Exemple #2
0
/*
 * Display a leaders node.
 *
 * Output order: print_esc(null_str), an optional one-letter prefix ("c" for
 * C_LEADERS, "x" for X_LEADERS), the text "leaders ", the glue specification
 * and finally the node list reached through leader_ptr().
 */
void tex::show_leaders(ptr p)
{
	print_esc(null_str);

	/* Work out which prefix letter, if any, applies to this subtype. */
	const bool is_c = (tex::subtype(p) == C_LEADERS);
	const bool is_x = !is_c && (tex::subtype(p) == X_LEADERS);

	if (is_c) {
		print("c");
	} else if (is_x) {
		print("x");
	}

	print("leaders ");
	print_spec(glue_ptr(p), null_str);
	node_list_display(leader_ptr(p));
}
Exemple #3
0
/*
 * Display an insertion node.
 *
 * Prints the "insert" control-sequence name followed by the node's subtype,
 * then ", natural size <ins_height>; split(<split_top_ptr spec>,<ins_depth>);
 * float cost <float_cost>", and finally displays the node list reached
 * through ins_ptr().
 */
void tex::show_insertion(ptr p)
	{
	/* Header: control-sequence name and the subtype number. */
	print_esc("insert");
	print_int(tex::subtype(p));
	print(", natural size ");
	print_scaled(ins_height(p));
	/* Split parameters: a glue spec and a scaled depth, comma separated. */
	print("; split(");
	print_spec(split_top_ptr(p), null_str);
	print(",");
	print_scaled(ins_depth(p));
	print("); float cost ");
	print_int(float_cost(p));
	/* Contents of the insertion. */
	node_list_display(ins_ptr(p));
	}
Exemple #4
0
/*
 * Write the string `p` to stdout with HTML encoding applied.
 *
 * Runs of ordinary characters are copied verbatim.  '<', '>' and '&' are
 * replaced by their entities, ASCII_HYPH is written as a plain '-', and a
 * backslash starts an escape sequence that is decoded by a2roffdeco() and
 * dispatched on the resulting decoration type.
 *
 * h          output state passed through to print_res/print_spec/print_metaf
 * p          NUL-terminated input string
 * norecurse  when non-zero, font escapes (previous/bold/italic/roman) are
 *            consumed but not forwarded to print_metaf()
 *
 * Returns 1 when the last escape in the string is DECO_NOSPACE and it ends
 * the string, 0 otherwise.
 */
static int
print_encode(struct html *h, const char *p, int norecurse)
{
	static const char	 stops[6] = { '\\', '<', '>', '&', ASCII_HYPH, '\0' };
	const char		*esc;
	enum roffdeco		 deco;
	size_t			 span;
	int			 consumed;
	int			 nospace = 0;

	for (; *p != '\0'; p++) {
		/* Emit everything up to the next character needing attention. */
		span = strcspn(p, stops);
		fwrite(p, 1, span, stdout);
		p += span;

		if (*p == '<') {
			fputs("&lt;", stdout);
			continue;
		}
		if (*p == '>') {
			fputs("&gt;", stdout);
			continue;
		}
		if (*p == '&') {
			fputs("&amp;", stdout);
			continue;
		}
		if (*p == ASCII_HYPH) {
			/*
			 * Note: "soft hyphens" aren't graphically displayed
			 * when not breaking the text; we want them to be
			 * displayed, so a plain hyphen is written instead.
			 */
			putchar('-');
			continue;
		}
		if (*p == '\0')
			break;

		/* Only a backslash remains: decode the escape after it. */
		esc = ++p;
		consumed = a2roffdeco(&deco, &esc, &span);

		if (deco == DECO_RESERVED) {
			print_res(h, esc, span);
		} else if (deco == DECO_SSPECIAL || deco == DECO_SPECIAL) {
			print_spec(h, deco, esc, span);
		} else if (deco == DECO_PREVIOUS || deco == DECO_BOLD ||
		    deco == DECO_ITALIC || deco == DECO_ROMAN) {
			if (!norecurse)
				print_metaf(h, deco);
		}

		/* Leave p on the last byte of the escape; the loop steps past it. */
		p += consumed - 1;

		if (deco == DECO_NOSPACE && p[1] == '\0')
			nospace = 1;
	}

	return(nospace);
}
/*
 * Main function: implements the raw Ethernet send latency test
 * (raw_ethernet_send_lat).
 *
 * Steps, in order: parse the user parameters, find and open the device,
 * validate MTU/link type, set up the connection and its flow rules, create the
 * IB (or rdma_cm) resources, attach the flow rules to QP 0 (plus an optional
 * promiscuous rule), build one raw Ethernet packet, post the work requests,
 * run the latency iterations, print the report and release the flows and the
 * test context.
 *
 * Returns 0 on success.  Every failure path returns a non-zero value
 * (1, FAILURE, or 17 when run_iter_lat_send() fails); a parser exit for
 * version/help also returns 1, without the error message.
 */
int main(int argc, char *argv[])
{

	struct ibv_device		*ib_dev = NULL;
	struct pingpong_context		ctx;
	struct raw_ethernet_info	my_dest_info,rem_dest_info;
	int				ret_parser;
	struct perftest_parameters	user_param;
#ifdef HAVE_RAW_ETH_EXP
	struct ibv_exp_flow		*flow_create_result = NULL;
	struct ibv_exp_flow_attr	*flow_rules = NULL;
	struct ibv_exp_flow 		*flow_promisc = NULL;
#else
	struct ibv_flow		*flow_create_result = NULL;
	struct ibv_flow_attr	*flow_rules = NULL;
#endif
	struct report_options		report;

	//zero-initialise the test context, user parameters and address info
	memset(&ctx,		0, sizeof(struct pingpong_context));
	memset(&user_param, 0, sizeof(struct perftest_parameters));
	memset(&my_dest_info, 0 , sizeof(struct raw_ethernet_info));
	memset(&rem_dest_info, 0 , sizeof(struct raw_ethernet_info));

	/* init default values to user's parameters that are relevant for this test:
	* Raw Ethernet Send Latency Test
	*/
	user_param.verb    = SEND;
	user_param.tst     = LAT;
	//copy at most size-1 bytes: strncpy() does not NUL-terminate on truncation,
	//and the last byte is already zero from the memset above
	strncpy(user_param.version, VERSION, sizeof(user_param.version) - 1);
	user_param.connection_type = RawEth;
	user_param.r_flag  = &report;

	if (check_flow_steering_support()) {
		return 1;
	}

	/* Configure the parameters values according to user
												arguments or default values. */
	ret_parser = parser(&user_param, argv,argc);

	//check for parsing errors (version/help exits are not reported as errors)
	if (ret_parser) {
		if (ret_parser != VERSION_EXIT && ret_parser != HELP_EXIT)
			fprintf(stderr," Parser function exited with Error\n");
		DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
		return 1;
	}

	//this is a bidirectional test, so we need to let the init functions
	//to think we are in duplex mode
	//TODO: ask Ido if that's ok, or should I add another field in user_param
	user_param.duplex  = 1;


	// Find the selected IB device (or default if the user didn't select one).
	ib_dev = ctx_find_dev(user_param.ib_devname);
	if (!ib_dev) {
		fprintf(stderr," Unable to find the Infiniband/RoCE device\n");
		DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
		return 1;
	}

	// Getting the relevant context from the device
	ctx.context = ibv_open_device(ib_dev);
	if (!ctx.context) {
		fprintf(stderr, " Couldn't get context for the device\n");
		DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
		return 1;
	}

	// See if MTU and link type are valid and supported.
	if (check_link_and_mtu(ctx.context, &user_param)) {
		//report the check that actually failed, not the device-open message
		fprintf(stderr, " MTU or link type is not valid or not supported\n");
		DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
		return FAILURE;
	}

	// Allocating arrays needed for the test.
	alloc_ctx(&ctx, &user_param);

	// Print basic test information.
	ctx_print_test_info(&user_param);

	//set up the connection, return the required flow rules (notice that user_param->duplex == TRUE)
	//so the function will setup like it's a bidirectional test
	if (send_set_up_connection(&flow_rules, &ctx, &user_param, &my_dest_info, &rem_dest_info)) {
		fprintf(stderr," Unable to set up socket connection\n");
		return 1;
	}

	//print specifications of the test
	print_spec(flow_rules,&user_param);

	// Create (if necessary) the rdma_cm ids and channel.
	if (user_param.work_rdma_cm == ON) {

		//create resources
		if (create_rdma_resources(&ctx, &user_param)) {
			fprintf(stderr," Unable to create the rdma_resources\n");
			return FAILURE;
		}

		if (user_param.machine == CLIENT) {

			//Connects the client to a QP on the other machine with rdma_cm
			if (rdma_client_connect(&ctx, &user_param)) {
				fprintf(stderr,"Unable to perform rdma_client function\n");
				return FAILURE;
			}

		} else if (rdma_server_connect(&ctx, &user_param)) {
			//Assigning a server to listen on rdma_cm port and connect to it.
			fprintf(stderr,"Unable to perform rdma_server function\n");
			return FAILURE;
		}

	} else {

		// initialize IB resources (data buffer, PD, MR, CQ and events channel)
		if (ctx_init(&ctx, &user_param)) {
			fprintf(stderr, " Couldn't create IB resources\n");
			return FAILURE;
		}
	}


	//attaching the qp to the spec
#ifdef HAVE_RAW_ETH_EXP
	flow_create_result = ibv_exp_create_flow(ctx.qp[0], flow_rules);
#else
	flow_create_result = ibv_create_flow(ctx.qp[0], flow_rules);
#endif
	if (!flow_create_result){
		perror("error");
		fprintf(stderr, "Couldn't attach QP\n");
		return FAILURE;
	}

#ifdef HAVE_RAW_ETH_EXP
	if (user_param.use_promiscuous) {
		struct ibv_exp_flow_attr attr = {
			.type = IBV_EXP_FLOW_ATTR_ALL_DEFAULT,
			.num_of_specs = 0,
			.port = user_param.ib_port,
			.flags = 0
		};

		//failure here is reported but not fatal: the test continues without
		//the promiscuous rule and flow_promisc stays NULL
		if ((flow_promisc = ibv_exp_create_flow(ctx.qp[0], &attr)) == NULL) {
			perror("error");
			fprintf(stderr, "Couldn't attach promiscous rule QP\n");
		}
	}
#endif

	//build ONE Raw Ethernet packets on ctx buffer
	create_raw_eth_pkt(&user_param,&ctx, &my_dest_info , &rem_dest_info);

	//result table header; the RESULT_* macros are used directly as format strings
	if (user_param.output == FULL_VERBOSITY) {
		printf(RESULT_LINE);
		printf("%s",(user_param.test_type == ITERATIONS) ? RESULT_FMT_LAT : RESULT_FMT_LAT_DUR);
		printf((user_param.cpu_util_data.enable ? RESULT_EXT_CPU_UTIL : RESULT_EXT));
	}

	// Prepare IB resources for rtr(ready to read)/rts(ready to send)
	if (user_param.work_rdma_cm == OFF) {
		if (ctx_connect(&ctx, NULL, &user_param, NULL)) {
			fprintf(stderr," Unable to Connect the HCA's through the link\n");
			DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
			return 1;
		}
	}


	//Post Send send_wqes for current message size
	ctx_set_send_wqes(&ctx,&user_param,NULL);

	// Post receive recv_wqes for current message size
	if (ctx_set_recv_wqes(&ctx,&user_param)) {
		fprintf(stderr," Failed to post receive recv_wqes\n");
		return 1;
	}

	//latency test function for SEND verb latency test.
	if (run_iter_lat_send(&ctx, &user_param))
	{
		return 17;
	}

	//print report (like print_report_bw) in the correct format
	// (as set before: FMT_LAT or FMT_LAT_DUR)
	if (user_param.test_type == ITERATIONS) {
		print_report_lat(&user_param);
	} else {
		print_report_lat_duration(&user_param);
	}

	//destroy promisc flow, but only if it was actually created above;
	//passing the NULL left by a failed creation to the destroy call is invalid
#ifdef HAVE_RAW_ETH_EXP
	if (user_param.use_promiscuous && flow_promisc != NULL) {
		if (ibv_exp_destroy_flow(flow_promisc)) {
			perror("error");
			fprintf(stderr, "Couldn't Destory promisc flow\n");
			return FAILURE;
		}
	}
#endif


	//destroy flow
#ifdef HAVE_RAW_ETH_EXP
	if (ibv_exp_destroy_flow(flow_create_result)) {
#else
	if (ibv_destroy_flow(flow_create_result)) {
#endif
		perror("error");
		fprintf(stderr, "Couldn't Destory flow\n");
		return FAILURE;
	}
	free(flow_rules);



	//Deallocate all perftest resources.
	if (destroy_ctx(&ctx, &user_param)) {
		fprintf(stderr,"Failed to destroy_ctx\n");
		DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
		return 1;
	}

	if (user_param.output == FULL_VERBOSITY)
		printf(RESULT_LINE);

	DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__);
	return 0;

}
Exemple #6
0
/*
 * Program entry point for the particle-mesh / COLA N-body driver.
 *
 * Expects the path of a parameter file as argv[1]; without it, task 0 prints
 * a usage message and every task finalizes MPI and exits with status 0.
 *
 * Sequence, as visible below:
 *   1. Initialise MPI and the FFTW MPI layer.
 *   2. Read the parameter file and derive the stepping options from UseCOLA.
 *   3. Print the run parameters (task 0 only).
 *   4. Initialise the transfer function, power spectrum, FFTs and particles,
 *      then build the initial particle positions/velocities from the ZA and
 *      LPT displacement arrays.
 *   5. Run nsteps+1 iterations of move / density / forces / kick / drift; the
 *      last iteration performs only the kick and jumps to `finalize`.
 *   6. Convert velocities with velRSD(), write output via slice(), release
 *      memory and FFTW plans, and finalize MPI.
 *
 * Most state (ThisTask, NTask, ierr, P, Disp, density, N11.., plan, ...) is
 * declared outside this function.
 *
 * NOTE(review): none of the malloc()/calloc() results in this function are
 * checked before use, and the `ierr` values returned by the MPI calls are
 * stored but never inspected.
 *
 * Returns 0.
 */
int main(int argc, char **argv) {
   
  // Set up MPI
  // ==========
  ierr = MPI_Init(&argc, &argv);
  ierr = MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
  ierr = MPI_Comm_size(MPI_COMM_WORLD, &NTask);
#ifdef SINGLE_PRECISION
  fftwf_mpi_init();
#else
  fftw_mpi_init();
#endif

  // A parameter file is mandatory; only task 0 prints the usage message.
  if(argc < 2) {
    if(ThisTask == 0) {
      fprintf(stdout, "Input parameters not found\n");
      fprintf(stdout, "Call with <ParameterFile>\n");
    }
    ierr = MPI_Finalize();
    exit(0);
  }
   
  // Read the run parameters and setup code
  // ======================================
  int stepDistr;   
  int subtractLPT;
  double da=0;

  read_parameterfile(argv[1]);
  
  // Stepping options are fixed by UseCOLA:
  //   subtractLPT - multiplies every LPT correction term in the kick/drift below
  //   stepDistr   - 0: steps uniform in a, 1: uniform in log(a), 2: uniform in CosmoTime(a)
  //   StdDA       - selects which kick/drift operator pair is used (see the time loop)
  // NOTE(review): with the assignments below stepDistr is only ever 0 or 1 and StdDA
  // only 0 or 2 here, so the `stepDistr == 2` and `StdDA == 1` branches further down
  // are not taken unless something outside this function changes StdDA.
  if (UseCOLA == 1){
    subtractLPT = 1; 
    stepDistr   = 0;
    StdDA       = 0;
  } else{
    subtractLPT = 0; 
    stepDistr   = 1;
    StdDA       = 2;
  }
  if (StdDA == 0){
    fullT = 1;
    nLPT  = -2.5;
  }
  filter = 0;              // Whether or not to smooth the forces
  Scale  = 2.*M_PI/Box;    // The force smoothing scale 

  // Echo the run parameters (task 0 only).
  if(ThisTask == 0) {
    printf("Run Parameters\n");
    printf("==============\n");
    printf("Cosmology:\n");
    printf("  Omega Matter(z=0) = %lf\n",Omega);
    printf("  Omega Baryon(z=0) = %lf\n",OmegaBaryon);
    printf("  Hubble Parameter(z=0) = %lf\n",HubbleParam);
    printf("  Sigma8(z=0) = %lf\n",Sigma8);
#ifndef GAUSSIAN
    printf("  F_nl = %lf\n",Fnl);
#endif
    printf("  Primordial Index = %lf\n",PrimordialIndex);
    printf("  Initial Redshift  = %lf\n",Init_Redshift);
    printf("  Final Redshift    = %lf\n",Final_Redshift);
#ifndef GAUSSIAN
    printf("  F_nl Redshift  = %lf\n",Fnl_Redshift);
#endif
    printf("Simulation:\n");
    printf("  Nmesh = %d\n", Nmesh);
    printf("  Nsample = %d\n", Nsample);
    printf("  Boxsize = %lf\n", Box);
    printf("  Buffer Size = %lf\n", Buffer);
    // WhichTransfer is only reported when WhichSpectrum == 0.
    switch(WhichSpectrum) {
      case 0:
        switch (WhichTransfer) {
          case 1:
            printf("  Using Eisenstein & Hu Transfer Function\n");
            break;
          case 2:
            printf("  Using Tabulated Transfer Function\n");
            break;
          default:
            printf("  Using Efstathiou Transfer Function\n");
            break;
        }
        break;
      case 1:
        printf("  Using Eisenstein & Hu Power Spectrum\n");
        break;
      case 2:
        printf("  Using Tabulated Power Spectrum\n");
        break;   
      default:
        printf("  Using Efstathiou Power Spectrum\n");
        break;
    }      
    printf("  Number of Timesteps = %d\n",nsteps);
    if (UseCOLA) {
      printf("  Using COLA method\n\n");
    } else {
      printf("  Using Standard PM method\n\n");
    }
    fflush(stdout);
  }   
  
  // Initial and final scale factors:
  double ai=1.0/(1.0+Init_Redshift);
  double af=1.0/(1.0+Final_Redshift);
    
  // Step size, in the variable selected by stepDistr (a, log(a) or CosmoTime(a)).
  if (stepDistr == 0) da=(af-ai)/((double)nsteps);
  if (stepDistr == 1) da=(log(af)-log(ai))/((double)nsteps);
  if (stepDistr == 2) da=(CosmoTime(af)-CosmoTime(ai))/((double)nsteps);

  set_units();

  if (ThisTask == 0) {
    printf("Initialising Transfer Function/Power Spectrum\n");
    printf("=============================================\n");
  }
  initialize_transferfunction();
  initialize_powerspectrum();
  initialize_ffts();
  initialize_parts();

  if(ThisTask == 0) {
    printf("Creating initial conditions\n");
    printf("===========================\n");
    fflush(stdout);
  }

  // Calculate the Zeldovich and 2LPT displacements and create the initial conditions
  // ================================================================================
  int i, j, k, m;
  unsigned int n, coord;
  double A=ai;                // This is the scale factor which we'll be advancing below.
  double Di=growthD(1.0, A);  // initial growth factor
  double Di2=growthD2(A);     // initial 2nd order growth factor  
  double Dv=DprimeQ(A,1.0);   // T[D_{za}]=dD_{za}/dy
  double Dv2=growthD2v(A);    // T[D_{2lpt}]=dD_{2lpt}/dy

  displacement_fields();
    
  // Particle array sized to NumPart*Buffer entries (rounded up, then cast to int).
  // NOTE(review): the result is not checked for NULL before the loop below writes to it.
  P = (struct part_data *) malloc((int)(ceil(NumPart*Buffer))*sizeof(struct part_data));

  // Generate the initial particle positions and velocities
  // If subtractLPT = 0 (non-COLA), then velocity is ds/dy, which is simply the 2LPT IC.
  // Else set vel = 0 if we subtract LPT. This is the same as the action of the operator L_- from TZE, as initial velocities are in 2LPT.
  for(i=0; i<Local_np; i++) {
    for (j=0; j<Nsample; j++) {
      for (k=0; k<Nsample; k++) {
        // Local flattened index of grid point (i,j,k); the ID uses the global first index.
        coord = (i * Nsample + j) * Nsample + k;
           
        P[coord].ID = ((i + Local_p_start) * Nsample + j) * Nsample + k;
        for (m=0; m<3; m++) {
          P[coord].Dz[m] = ZA[m][coord];
          P[coord].D2[m] = LPT[m][coord];
          if (subtractLPT == 0) {
            P[coord].Vel[m]=P[coord].Dz[m]*Dv+P[coord].D2[m]*Dv2;
          } else {
            P[coord].Vel[m] = 0.0;
          }
        }

        // Grid position plus the growth-weighted displacements, wrapped by periodic_wrap().
        P[coord].Pos[0] = periodic_wrap((i+Local_p_start)*(Box/Nsample)+P[coord].Dz[0]*Di+P[coord].D2[0]*Di2);
        P[coord].Pos[1] = periodic_wrap(j*(Box/Nsample)+P[coord].Dz[1]*Di+P[coord].D2[1]*Di2);
        P[coord].Pos[2] = periodic_wrap(k*(Box/Nsample)+P[coord].Dz[2]*Di+P[coord].D2[2]*Di2);
      }
    }
  }

  // The displacement arrays have been copied into P and are no longer needed.
  for (i=0; i<3; i++) {
    free(ZA[i]);
    free(LPT[i]);
  }

  // Now, we get to the N-Body part where we evolve with time via the Kick-Drift-Kick Method
  // =======================================================================================
  int timeStep;
  double AF=0,AI,AC,AFF=0;
  double growth1   = Di;
  double growth1L2 = Di2;

  // The density grid and force grids  and associated fftw plans
  // Without MEMORY_MODE these are allocated once here and released after `finalize`;
  // with MEMORY_MODE they are allocated and released inside every iteration instead.
  // Each complex pointer aliases the corresponding real buffer.
#ifndef MEMORY_MODE
  density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
  P3D  = (complex_kind*)density;
  FN11 = (complex_kind*)N11;
  FN12 = (complex_kind*)N12;
  FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
  plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
  plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
  p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

 if(ThisTask == 0) {
    printf("Beginning timestepping\n");
    printf("======================\n");
    fflush(stdout);
  }
  
  // AI stores the scale factor to which the velocities have been kicked to. Initially it's just A.
  AI=A;
  // Note the inclusive bound: nsteps+1 iterations, the last one doing only the final kick.
  for (timeStep=0;timeStep<=nsteps;timeStep++){
    
    // AFF is the scale factor to which we should drift the particle positions.
    // AF is the scale factor to which we should kick the particle velocities.
    if (stepDistr == 0) AFF=A+da;
    if (stepDistr == 1) AFF=A*exp(da);
    if (stepDistr == 2) AFF=AofTime(CosmoTime(A)+da);

    // half time-step for final kick
    if (timeStep == nsteps) {
      AF=A; 
    } else { 
      // Set to mid-point of interval. In the infinitesimal timestep limit, these choices are identical. 
      // How one chooses the mid-point when not in that limit is really an extra degree of freedom in the code 
      // but Tassev et al. report negligible effects from the different choices below. 
      // Hence, this is not exported as an extra switch at this point.
      if (stepDistr == 0) AF=A+da*0.5;
      if (stepDistr == 1) AF=A*exp(da*0.5);
      if (stepDistr == 2) AF=AofTime((CosmoTime(AFF)+CosmoTime(A))*0.5); 
    }
    
    if (ThisTask == 0) {
      printf("Iteration = %d\n------------------\n",timeStep+1);
      printf("a = %lf\n",A);
      printf("z = %lf\n",1.0/A-1.0);
      fflush(stdout);
    }

    // First we check whether all the particles are on the correct processor after the last time step/
    // original 2LPT displacement and move them if not
    if (ThisTask == 0) printf("Moving particles across task boundaries...\n");
    MoveParticles();

    // MEMORY_MODE: the density grid and its forward plan are created for this iteration only.
#ifdef MEMORY_MODE
    density = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    P3D  = (complex_kind*)density;
#ifdef SINGLE_PRECISION
    plan = fftwf_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    plan = fftw_mpi_plan_dft_r2c_3d(Nmesh,Nmesh,Nmesh,density,P3D,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif

    // Then we do the Cloud-in-Cell assignment to get the density grid and FFT it.  
    if (ThisTask == 0) printf("Calculating density using Cloud-in-Cell...\n");
    PtoMesh();

    // MEMORY_MODE: the three force grids and their inverse plans are created for this iteration only.
#ifdef MEMORY_MODE
    N11  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N12  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    N13  = (float_kind *)calloc(2*Total_size,sizeof(float_kind));
    FN11 = (complex_kind*)N11;
    FN12 = (complex_kind*)N12;
    FN13 = (complex_kind*)N13;
#ifdef SINGLE_PRECISION
    p11  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftwf_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#else
    p11  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN11,N11,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p12  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN12,N12,MPI_COMM_WORLD,FFTW_ESTIMATE);
    p13  = fftw_mpi_plan_dft_c2r_3d(Nmesh,Nmesh,Nmesh,FN13,N13,MPI_COMM_WORLD,FFTW_ESTIMATE);
#endif
#endif
    
    // This returns N11,N12,N13 which hold the components of
    // the vector (grad grad^{-2} density) on a grid.
    if (ThisTask == 0) printf("Calculating forces...\n");
    Forces(); 

    // Allocate the per-particle displacement arrays; they are freed after the kick below.
    // NOTE(review): the MEMORY_MODE branch allocates with sizeof(float) while the other
    // branch uses sizeof(float_kind) -- presumably Disp is declared as float* in
    // MEMORY_MODE; confirm against the declaration of Disp.
#ifdef MEMORY_MODE
    free(density);
    for (i=0; i<3; i++) Disp[i] = (float *)malloc(NumPart*sizeof(float));
#ifdef SINGLE_PRECISION
    fftwf_destroy_plan(plan);
#else 
    fftw_destroy_plan(plan);
#endif
#else
    for (i=0; i<3; i++) Disp[i] = (float_kind *)malloc(NumPart*sizeof(float_kind));
#endif
    
    // Now find the accelerations at the particle positions using 3-linear interpolation. 
    if (ThisTask == 0) printf("Calculating accelerations...\n");
    MtoParticles();

    // MEMORY_MODE: the force grids and their plans are released again for this iteration.
#ifdef MEMORY_MODE
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif
    
    // Calculate the mean displacement and subtract later.
    if (ThisTask == 0) printf("Calculating mean of displacements...\n");
    double sumDx=0,sumDy=0,sumDz=0;
    for(n=0; n<NumPart; n++) {
      sumDx += Disp[0][n];
      sumDy += Disp[1][n];
      sumDz += Disp[2][n];
    }

    // Make sumDx, sumDy and sumDz global averages
    // (sum over all tasks first, then divide by the total particle count)
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumDz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumDx /= (double)TotNumPart; // We will subtract these below to conserve momentum. 
    sumDy /= (double)TotNumPart;
    sumDz /= (double)TotNumPart; 

    if (ThisTask == 0) {
      printf("Kicking the particles...\n");
      fflush(stdout);
    }

    // Kick
    // ===============
    double dda;
    double q1,q2;
    double ax,ay,az;
    double sumx=0,sumy=0,sumz=0; 
    double Om143=pow(Omega/(Omega+(1-Omega)*A*A*A),1./143.);
    
    // Kick weight: the operator used depends on StdDA.
    if (StdDA == 0) {
      dda=Sphi(AI,AF,A);
    } else if (StdDA == 1) {
      dda=(AF-AI)*A/Qfactor(A);
    } else {
      dda=SphiStd(AI,AF);
    }  
    
    q2=1.5*Omega*growth1*growth1*(1.0+7./3.*Om143)*A; // T^2[D_{2lpt}]=d^2 D_{2lpt}/dy^2
    q1=1.5*Omega*growth1*A;                           // T^2[D_{ZA}]=d^2 D_{ZA}/dy^2
    
    for(n=0; n<NumPart; n++) {

      // Remove the global mean displacement computed above.
      Disp[0][n] -= sumDx;
      Disp[1][n] -= sumDy;
      Disp[2][n] -= sumDz;

      // Acceleration; the second term vanishes when subtractLPT == 0.
      ax=-1.5*Omega*Disp[0][n]-subtractLPT*(P[n].Dz[0]*q1+P[n].D2[0]*q2)/A;
      ay=-1.5*Omega*Disp[1][n]-subtractLPT*(P[n].Dz[1]*q1+P[n].D2[1]*q2)/A;
      az=-1.5*Omega*Disp[2][n]-subtractLPT*(P[n].Dz[2]*q1+P[n].D2[2]*q2)/A;

      P[n].Vel[0] += ax*dda;
      P[n].Vel[1] += ay*dda;
      P[n].Vel[2] += az*dda;

      sumx += P[n].Vel[0];
      sumy += P[n].Vel[1];
      sumz += P[n].Vel[2];
    }

    for (i=0; i<3; i++) free(Disp[i]);

    // Make sumx, sumy and sumz global averages
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumx,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumy,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    ierr = MPI_Allreduce(MPI_IN_PLACE,&sumz,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);  
    
    sumx /= (double)TotNumPart;  // We will subtract these below to conserve momentum. 
    sumy /= (double)TotNumPart;  // Should be conserved, but just in case 3-linear interpolation makes a problem.
    sumz /= (double)TotNumPart;  // Never checked whether this makes a difference.

    // Final iteration: only the kick is applied, then the loop is left via `finalize`.
    if (timeStep == nsteps) {

      if (ThisTask == 0) {
        printf("Iteration %d finished\n------------------\n\n", timeStep+1);
        printf("Timestepping finished\n\n");
        fflush(stdout);
      }
    
      // At final timestep, add back LPT velocities if we had subtracted them. 
      // This corresponds to L_+ operator in TZE.
      Dv  = DprimeQ(A,1.0);  // dD_{za}/dy
      Dv2 = growthD2v(A);    // dD_{2lpt}/dy

      // The mean velocity is removed here as well; the LPT term is zero when subtractLPT == 0.
      for(n=0; n<NumPart; n++) {
        P[n].Vel[0] += -sumx+(P[n].Dz[0]*Dv+P[n].D2[0]*Dv2)*subtractLPT;
        P[n].Vel[1] += -sumy+(P[n].Dz[1]*Dv+P[n].D2[1]*Dv2)*subtractLPT;
        P[n].Vel[2] += -sumz+(P[n].Dz[2]*Dv+P[n].D2[2]*Dv2)*subtractLPT;
      }

      goto finalize; // Sorry for "goto" :)
    }
    
    if (ThisTask == 0) {
      printf("Drifting the particles...\n");
      fflush(stdout);
    }

    // Drift
    // =============
    double dyyy;
    double da1,da2;

    // From here on AC holds the kick target (old AF) and AF holds the drift target (AFF).
    AC = AF;
    AF = AFF;
    
    // Drift weight: the operator used depends on StdDA.
    if (StdDA == 0) {
      dyyy=Sq(A,AF,AC);
    } else if (StdDA == 1) {
      dyyy=(AF-A)/Qfactor(AC);
    } else {
      dyyy=SqStd(A,AF);
    }

    da1=growthD(1.0, AF)-growth1;    // change in D
    da2=growthD2(AF)-growth1L2; // change in D_{2lpt}
    
    for(n=0; n<NumPart; n++) {
        // Move with the mean-subtracted velocity...
        P[n].Pos[0] += (P[n].Vel[0]-sumx)*dyyy;
        P[n].Pos[1] += (P[n].Vel[1]-sumy)*dyyy;
        P[n].Pos[2] += (P[n].Vel[2]-sumz)*dyyy;

        // ...then add the LPT displacement change (zero when subtractLPT == 0) and wrap.
        P[n].Pos[0] = periodic_wrap(P[n].Pos[0]+subtractLPT*(P[n].Dz[0]*da1+P[n].D2[0]*da2));
        P[n].Pos[1] = periodic_wrap(P[n].Pos[1]+subtractLPT*(P[n].Dz[1]*da1+P[n].D2[1]*da2));
        P[n].Pos[2] = periodic_wrap(P[n].Pos[2]+subtractLPT*(P[n].Dz[2]*da1+P[n].D2[2]*da2));
    }

    // Step in time
    // ================
    A  = AF;   // WRT to the above name change, A  = AFF
    AI = AC;   // WRT to the above name change, AI = AF

    growth1   = growthD(1.0, A);
    growth1L2 = growthD2(A);

    if (ThisTask == 0) {
      printf("Iteration %d finished\n------------------\n\n", timeStep+1);
      fflush(stdout);
    }
     
    ierr = MPI_Barrier(MPI_COMM_WORLD);

  }

  // Here is the last little bit
  // ===========================
  finalize:

  if (ThisTask == 0) {
    printf("Finishing up\n");
    printf("============\n");
    fflush(stdout);
  }
    
  // Now convert velocities to v_{rsd}\equiv (ds/d\eta)/(a H(a))
  velRSD(A);
    
  // Output a slice just for the sake of doing something with P.
  // NOTE(review): the "Converting to RSD velocities..." message is printed after
  // velRSD() has already been called above.
  if (ThisTask == 0) {
    printf("Converting to RSD velocities...\n");
    printf("Outputting particles...\n"); 
  }
  slice();
  print_spec();
  fflush(stdout);

  // Release lookup tables, particle data and the task-decomposition tables.
  free_powertable();
  free_transfertable();
#ifdef GENERIC_FNL
  free(KernelTable);
#endif

  free(P);
  free(Slab_to_task);
  free(Part_to_task);
  free(Local_nx_table);
  free(Local_np_table);
  // Without MEMORY_MODE the grids and plans created before the time loop are still alive here.
#ifndef MEMORY_MODE
  free(density);
  free(N11);
  free(N12);
  free(N13);  
#ifdef SINGLE_PRECISION
  fftwf_destroy_plan(plan);
  fftwf_destroy_plan(p11);
  fftwf_destroy_plan(p12);
  fftwf_destroy_plan(p13);
#else
  fftw_destroy_plan(plan);
  fftw_destroy_plan(p11);
  fftw_destroy_plan(p12);
  fftw_destroy_plan(p13);
#endif
#endif

#ifdef SINGLE_PRECISION
  fftwf_mpi_cleanup();
#else
  fftw_mpi_cleanup();
#endif

  if (ThisTask == 0) printf("Done :)\n");

  MPI_Finalize();   

  return 0;
}