Example #1
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * This test attempts collective communication after a process in
 * the communicator has failed.
 */
int main(int argc, char **argv)
{
    int rank, size, i, rc, errclass, toterrs, errs = 0;
    char rbuf[100000];
    char *sendbuf;
    int deadprocs[1] = {1};
    MPI_Group world, newgroup;
    MPI_Comm newcomm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    if (size < 3) {
        fprintf( stderr, "Must run with at least 3 processes\n" );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    MPI_Comm_group(MPI_COMM_WORLD, &world);
    MPI_Group_excl(world, 1, deadprocs, &newgroup);
    MPI_Comm_create_group(MPI_COMM_WORLD, newgroup, 0, &newcomm);

    if (rank == 1) {
        exit(EXIT_FAILURE);
    }

    /* try a small send first */
    sendbuf = (char *)malloc(10*size*sizeof(char));

    if (rank == 0) {
      for (i=0;i<size;i++) {
          strcpy(sendbuf + i*10, "No Errors");
      }
    }

    rc = MPI_Scatter(sendbuf, 10, MPI_CHAR, rbuf, 10, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    /* reset the buffers and try a larger scatter */
    free(sendbuf);
    memset(rbuf, 0, sizeof(rbuf));
    sendbuf = (char *)malloc(100000*size*sizeof(char));

    if (rank == 0) {
      for (i=0;i<size;i++) {
          strcpy(sendbuf + i*100000, "No Errors");
      }
    }

    rc = MPI_Scatter(sendbuf, 100000, MPI_CHAR, rbuf, 100000, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    rc = MPI_Reduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, 0, newcomm);
    if(rc)
        fprintf(stderr, "Failed to get errors from other processes\n");

    if (rank == 0) {
        if (toterrs) {
            printf( " Found %d errors\n", toterrs );
        }
        else {
            printf( " No Errors\n" );
        }
        fflush(stdout);
    }

    free(sendbuf);

    MPI_Comm_free(&newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&world);

    MPI_Finalize();

    return 0;
}
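
The pattern Example #1 builds on generalizes to any collective: switch the communicator's error handler to MPI_ERRORS_RETURN so failures surface as return codes, then classify the code with MPI_Error_class. A minimal sketch of that idiom (the local/global variables are illustrative, not part of the test above):

#include <mpi.h>
#include <stdio.h>

/* Sketch: check the error class of a reduction instead of aborting. */
void checked_reduce(int local, MPI_Comm comm)
{
    int global = 0, rc, eclass;
    MPI_Comm_set_errhandler(comm, MPI_ERRORS_RETURN);
    rc = MPI_Reduce(&local, &global, 1, MPI_INT, MPI_SUM, 0, comm);
    if (rc != MPI_SUCCESS) {
        MPI_Error_class(rc, &eclass);
        fprintf(stderr, "MPI_Reduce failed, error class %d\n", eclass);
    }
}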
Example #2
int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 * 	Problem Init
	 */

	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
                        (int)(allocated_memory/(1024*1024)),
			(int)(allocated_memory_extra/(1024*1024)),
                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
//	STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 * 	Report performance
	 */

	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	if (rank == 0)
	{
		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));

		unsigned n = size;
		double flop = (2.0f*n*n*n)/3.0f;
		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
	}

	/*
	 *	Test Result Correctness
	 */

	if (check)
	{
		/*
		 *	Compute || A - LU ||
		 */

		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);

#if 0
		/*
		 *	Compute || Ax - LUx ||
		 */

		unsigned ind;

		TYPE *y2 = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y2);

		if (rank == 0)
		{
			for (ind = 0; ind < size; ind++)
			{
				y2[ind] = (TYPE)0.0;
			}
		}

		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);

		/* Compute y2 = y2 - y */
		CPU_AXPY(size, -1.0, y, 1, y2, 1);

		TYPE err = CPU_ASUM(size, y2, 1);
		int max = CPU_IAMAX(size, y2, 1);

		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
#endif
	}

	/*
	 * 	Termination
	 */

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	starpu_cublas_shutdown();
	starpu_mpi_shutdown();
	starpu_shutdown();

#if 0
	MPI_Finalize();
#endif

	return 0;
}
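
Example #2 issues three reductions on the same local timing to collect the minimum, maximum, and sum. Since MPI-2 the root may pass MPI_IN_PLACE as the send buffer and contribute through the receive buffer instead; a hedged sketch of the same statistics under that variant (reusing the example's timing, rank, and world_size):

	/* Sketch: min/max/avg of a per-rank timing with MPI_IN_PLACE on the root.
	 * Receive buffers are significant only at the root, so workers pass NULL. */
	double min_t = timing, max_t = timing, sum_t = timing;
	if (rank == 0) {
		MPI_Reduce(MPI_IN_PLACE, &min_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
		MPI_Reduce(MPI_IN_PLACE, &max_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(MPI_IN_PLACE, &sum_t, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
		fprintf(stderr, "avg: %f ms\n", sum_t/(world_size*1000));
	} else {
		MPI_Reduce(&timing, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	}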
Example #3
int main(int argc, char** argv)
{
	int rank;
	int size;
	MPI_Init(&argc, &argv);
	
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
        
	if(argc != 5) {
          fprintf(stderr, "argc %d\n", argc);
          fprintf(stderr,
              "Usage: %s <left> <right> <numSyncFiles> <tolerance>\n"
              "where <left> and <right> are different "
              "N-procs_case directories\n", argv[0]);
          MPI_Finalize();
          return 1;
        }
	char* lpath = argv[1];
	char* rpath = argv[2];
        int nSyncFiles = atoi(argv[3]);
        double tolerance = atof(argv[4]);

	int ndof;
	int nshg;
	int solsize;
	double* solution;

	std::set<int>* l_timesteps = find_timesteps(lpath, nSyncFiles);
	std::set<int>* r_timesteps = find_timesteps(rpath, nSyncFiles);
	std::set<int>* timesteps_to_check = new std::set<int>;
	std::set_intersection(l_timesteps->begin(), l_timesteps->end(),
			r_timesteps->begin(), r_timesteps->end(),
			std::inserter(*timesteps_to_check, timesteps_to_check->begin()));
        delete l_timesteps;
        delete r_timesteps;
	if(rank == 0)
		printf("Found %zu common timesteps\n",
				timesteps_to_check->size());
#ifdef DBGONLY
	read_solution(&solution, &solsize, &nshg, &ndof,
            size, rank, 0, nSyncFiles, "./");
	printf("nshg: %d, ndof: %d\n", nshg, ndof);
	assert(solsize == ndof*nshg);
#endif
	double maxerror = 0.0;
	double error;
	double gblmaxerror = 0.0;
	for(std::set<int>::iterator i = timesteps_to_check->begin();
			i!=timesteps_to_check->end();i++)
	{
		error = compare_solution(lpath, rpath, *i, size, nSyncFiles);
		if(error>maxerror) maxerror = error;
	}
        delete timesteps_to_check;
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Reduce(&maxerror, &gblmaxerror, 1, MPI_DOUBLE, MPI_MAX, 0,
		MPI_COMM_WORLD);
	if(rank == 0) printf("Maximum difference across all timesteps: %e\n", 
			gblmaxerror);
	MPI_Finalize();
        return (gblmaxerror > tolerance);
}
Example #4
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){

  int nx,ny,nz,procid,nprocs,ndim;
  int istart[3],isize[3],iend[3];
  int fstart[3],fsize[3],fend[3];
  int p3dfft_mem_conf,nrep;
  long int Nlocal,Nglob;
  double factor;
  double l_timers[12]={0},g_timers[12]={0};
  double total_time=0, setup_time=0;
  // l_timers holds the library's communication timings, measured locally on
  // each process; g_timers is their elementwise max, reduced to the root.
  // The sum of the first four elements of g_timers gives the communication time.
  unsigned char op_f[4]="fft", op_b[4]="tff";
  int memsize[3];

  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD,&procid);



  nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP;

  if(decomp==1){
    dims[0] = 1; dims[1] = nprocs;
  }
  if(procid == 0)
    printf("Using processor grid %d x %d\n",dims[0],dims[1]);

  /* Initialize P3DFFT */
  MPI_Barrier(MPI_COMM_WORLD);
  setup_time -= MPI_Wtime(); // start the setup timer
  Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize);
  setup_time += MPI_Wtime(); // stop the setup timer
  PCOUT<<"done with setup"<<std::endl;

  Cp3dfft_get_dims(istart,iend,isize,1);
  Cp3dfft_get_dims(fstart,fend,fsize,2);
  /* Allocate and initialize */

  double *A; // Input matrix A
  A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));
  //B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2));

  /* Warmup */
  Cp3dfft_ftran_r2c(A,A,op_f);
  Cp3dfft_ftran_r2c(A,A,op_f);
  MPI_Barrier(MPI_COMM_WORLD);
  Cset_timers();

  for (int rep=0; rep<nrep; rep++){
    initialize_p3dfft(A,n);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Forward transform */
    total_time -=  MPI_Wtime();
    Cp3dfft_ftran_r2c(A,A,op_f);
    total_time +=  MPI_Wtime();

    MPI_Barrier(MPI_COMM_WORLD);
  }

  Cget_timers(l_timers);
  Cp3dfft_btran_c2r(A,A,op_b);

  /* Compute Error */
  //PCOUT<<"Done With FFTs computing error"<<std::endl;
  compute_error_p3dfft(A,n);

  /* Gather timing statistics */
  double g_total_time, g_comm_time, g_setup_time;

  MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
  MPI_Reduce(l_timers,g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);


  g_total_time=g_total_time/nrep;
  g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep);
  //g_total_time=g_total_time/((double)nrep);
  ptrdiff_t size=n[0];size*=n[1]; size*=n[2];
  double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9;

  if(procid == 0){
    std::cout.precision(4);
    std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;;
    std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl;
    std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl;
    std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl;
    std::cout<<"\033[1;31m";
    std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl;
    std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
    std::cout<<"\033[0m\n"<<std::endl;

    results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl;
  }
  /* Free work space */
  fftw_free(A);
  Cp3dfft_clean();

}
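
A detail worth noting in Example #4: MPI_Reduce applied to an array operates elementwise, so a single call turns the 12 per-process timers into 12 per-element maxima at the root. A minimal sketch of that semantics:

  /* Sketch: elementwise reduction -- g[i] becomes the max over ranks of l[i]. */
  double l[12] = {0}, g[12] = {0};
  /* ... fill l with local timings ... */
  MPI_Reduce(l, g, 12, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);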
Example #5
int olsr_main(int argc, char *argv[])
{
    int i;
    char log[32];

    tw_opt_add(olsr_opts);
    tw_init(&argc, &argv);

#if DEBUG
    sprintf( log, "olsr-log.%ld", g_tw_mynode );

    olsr_event_log = fopen( log, "w+");
    if( olsr_event_log == nullptr )
        tw_error( TW_LOC, "Failed to Open OLSR Event Log file \n");
#endif

    g_tw_mapping = CUSTOM;
    g_tw_custom_initial_mapping = &olsr_custom_mapping;
    g_tw_custom_lp_global_to_local_map = &olsr_mapping_to_lp;

    // nlp_per_pe = OLSR_MAX_NEIGHBORS;// / tw_nnodes();
    //g_tw_lookahead = SA_INTERVAL;

    SA_range_start = nlp_per_pe;

    // Increase nlp_per_pe by nlp_per_pe / OMN
    nlp_per_pe += nlp_per_pe / OLSR_MAX_NEIGHBORS;

    g_tw_events_per_pe =  OLSR_MAX_NEIGHBORS / 2 * nlp_per_pe  + 65536;
    tw_define_lps(nlp_per_pe, sizeof(olsr_msg_data));

    for(i = 0; i < OLSR_END_EVENT; i++)
        g_olsr_event_stats[i] = 0;

    for (i = 0; i < SA_range_start; i++) {
        tw_lp_settype(i, &olsr_lps[0]);
    }

    for (i = SA_range_start; i < nlp_per_pe; i++) {
        tw_lp_settype(i, &olsr_lps[1]);
    }

#if DEBUG
    printf("g_tw_nlp is %llu\n", g_tw_nlp);
#endif

    tw_run();

    if( g_tw_synchronization_protocol != 1 )
    {
        MPI_Reduce( g_olsr_event_stats, g_olsr_root_event_stats, OLSR_END_EVENT, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    }
    else {
        for (i = 0; i < OLSR_END_EVENT; i++) {
            g_olsr_root_event_stats[i] = g_olsr_event_stats[i];
        }
    }

    if (tw_ismaster()) {
        for( i = 0; i < OLSR_END_EVENT; i++ )
            printf("OLSR Type %s Event Count = %llu \n", event_names[i], g_olsr_root_event_stats[i]);
        printf("Complete.\n");
    }

    tw_end();

    return 0;
}
Example #6
int main( int argc, char *argv[] )
{
  int opt;
  extern char   *optarg;
  extern int     optind;
  int is_output_timing=0, is_print_usage = 0;
  int _debug=0, use_gen_file = 0, use_actsto = 0, use_normalsto=0;
  char *token;

  MPI_Offset disp, offset, file_size;
  MPI_Datatype etype, ftype, buftype;

  int errs = 0;
  int size, rank, i, count;
  char *fname = NULL;
  double *buf;
  MPI_File fh;
  MPI_Comm comm;
  MPI_Status status;
  int64_t nitem = 0;
  int fsize = 0, type_size;
  double stime, etime, iotime, comptime, elapsed_time;
  double max_iotime, max_comptime;

  double max, min, sum=0.0, global_sum;

  MPI_Init( &argc, &argv );
 
  comm = MPI_COMM_WORLD;

  MPI_Comm_size( comm, &size );
  MPI_Comm_rank( comm, &rank );
 
  while ( (opt=getopt(argc,argv,"i:s:godhxt"))!= EOF) {
    switch (opt) {
    case 'i': fname = optarg;
      break;
    case 'o': is_output_timing = 1;
      break;
    case 'g': use_gen_file = 1;
      break;
    case 'd': _debug = 1;
      break;
    case 'h': is_print_usage = 1;
      break;
    case 's': 
        token = strtok(optarg, ":");
        //if (rank == 0) printf("token=%s\n", token);
        if(token == NULL) {
            if (rank == 0) printf("1: Wrong file size format!\n");
            MPI_Finalize();
            exit(1);
        }

        fsize = atoi(token);
        token = strtok(NULL, ":");
        //if (rank == 0) printf("token=%s\n", token);
        if(token == NULL) {
            if (rank == 0) printf("2: Wrong file size format!\n");
            MPI_Finalize();
            exit(1);
        }
        if(*token != 'm' && *token != 'g') {
            if (rank == 0) printf("3: Wrong file size format!\n");
            MPI_Finalize();
            exit(1);
        }
        if (rank ==0) printf("fsize = %d (%s)\n", fsize, (*token=='m'?"MB":"GB"));
      if (fsize == 0)
	nitem = 0;
      else {
	MPI_Type_size(MPI_DOUBLE, &type_size);
	nitem = fsize*1024; /* KB */
	nitem = nitem*1024; /* MB */
        if(*token == 'g') {
            //if(rank == 0) printf("data in GB\n");
            nitem = nitem*1024; /* GB */
        }
	nitem = nitem/type_size;
	//printf("nitem=%lld\n", nitem);
	nitem = nitem/size; /* size means comm size */
      }
      if (rank == 0) printf("nitem = %d\n", nitem);
      break;
    case 'x': use_actsto = 1;
      break;
    case 't': use_normalsto = 1;
      break;
    default: is_print_usage = 1;
      break;
    }
  }

  if (fname == NULL || is_print_usage == 1 || nitem == 0) {
    if (rank == 0) usage(argv[0]);
    MPI_Finalize();
    exit(1);
  }

  int sizeof_mpi_offset;
  sizeof_mpi_offset = (int)(sizeof(MPI_Offset)); // 8 
  //if (rank == 0) printf ("size_of_mpi_offset=%d\n", sizeof_mpi_offset);

  if(use_normalsto == 1 && use_actsto == 1) {
      if(rank == 0)
          printf("Can't test both: either normalsto or actsto\n");
      MPI_Finalize();
      exit(1);
  }
#if 0
  if(use_actsto == 1) {
      if (size != 1) {
          if(rank == 0)
              printf("active storage should be run with only 1 process!!!\n");
          MPI_Finalize();
          exit(1);
      }
  }
#endif
  /* initialize random seed: */
  srand(time(NULL));

  if(use_gen_file == 1) {
    int t, result;

    MPI_File_open( comm, fname, MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh );

    /* Set the file view */
    disp = rank * nitem * type_size;
    printf("%d: disp = %lld\n", rank, disp);
    etype = MPI_DOUBLE;
    ftype = MPI_DOUBLE;

    result = MPI_File_set_view(fh, disp, etype, ftype, "native", MPI_INFO_NULL);

    if(result != MPI_SUCCESS) 
      sample_error(result, "MPI_File_set_view");

    buf = (double *)malloc( nitem * sizeof(double) );

    if (buf == NULL) {
        if(rank == 0) printf("malloc() failed\n");
        MPI_Finalize();
        exit(1);
    }

    buf[0] = rand()%4096;
    if(rank==0) printf("%lf\n", buf[0]);
    max = min = sum = buf[0];

    for(i=1; i<nitem; i++) {
      t = rand()%4096;
      if (t>max) max = t;
      if (t<min) min = t;
      sum += t;
      buf[i] = t;
      if (i<10 && rank == 0) printf("%lf\n", buf[i]);
    }
    
    if(rank == 0) {
      printf("MPI_Type_size(MPI_DOUBLE)=%d\n", type_size);
      printf ("max=%lf, min=%lf, sum=%lf\n", max, min, sum);
    }

    stime = MPI_Wtime();
    /* Write to file */
    MPI_File_write_all( fh, buf, nitem, MPI_DOUBLE, &status );
    etime = MPI_Wtime();
    iotime = etime - stime;
      
    printf("%d: iotime (write) = %10.4f\n", rank, iotime);

    MPI_Get_count( &status, MPI_DOUBLE, &count );
    //printf("count = %lld\n", count);

    if (count != nitem) {
      fprintf( stderr, "%d: Wrong count (%lld) on write\n", rank, count );
      fflush(stderr);
      /* exit */
      MPI_Finalize();
      exit(1);
    }

    MPI_File_close(&fh);
    MPI_Barrier(MPI_COMM_WORLD);
    if(rank == 0) printf("File is written\n\n");
  }

  double *tmp = (double *)malloc( nitem * sizeof(double) );
  memset (tmp, 0, nitem*sizeof(double));

  if(use_normalsto == 1) {
      MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );
      /* Read nothing (check status) */
      memset( &status, 0xff, sizeof(MPI_Status) );
      
      offset = rank * nitem * type_size;

      /* start I/O */
      stime = MPI_Wtime();
      MPI_File_read_at(fh, offset, tmp, nitem, MPI_DOUBLE, &status);
      etime = MPI_Wtime();
      /* end I/O */
      iotime = etime - stime;
      
      if(_debug==1) printf("%d: iotime = %10.4f\n", rank, iotime);
      MPI_Reduce(&iotime, &max_iotime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
      
      sum = 0.0; /* reset sum */
      
      /* start computation */
      stime = MPI_Wtime();
      
      for(i=0; i<nitem; i++) {
          sum += tmp[i];
      }
      
      MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
      etime = MPI_Wtime();
      /* end computation */

      comptime = etime - stime;

      if(_debug==1) printf("%d: comptime = %10.4f\n", rank, comptime);
      
      MPI_Reduce(&comptime, &max_comptime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

      if(rank == 0) {
          elapsed_time = max_comptime + max_iotime;
          printf("<<Result (SUM) with normal read>>\n"
                 "SUM              = %10.4f \n"
                 "Computation time = %10.4f sec\n"
                 "I/O time         = %10.4f sec\n"
                 "total time       = %10.4f sec\n\n", 
                 global_sum, max_comptime, max_iotime, elapsed_time);
      }
      
      MPI_File_close(&fh);
  }
#if 0
  if(use_actsto == 1) {
#if 0
    /* MPI_MAX */
    MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );

    stime = MPI_Wtime();
    MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_MAX, &status );
    etime = MPI_Wtime();
    elapsed_time = etime-stime;
    printf ("<<Result with active storage>>\n"
	    "max=%lf (in %10.4f sec)\n", tmp[0], elapsed_time);
    
    MPI_File_close(&fh);
    
    /* MPI_MIN */
    MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );
    
    stime = MPI_Wtime();
    MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_MIN, &status );
    etime = MPI_Wtime();
    elapsed_time = etime - stime;
    printf ("min=%lf (in %10.4f sec)\n", tmp[0], elapsed_time); 
    
    MPI_File_close(&fh);
#endif

    /* MPI_SUM */
    MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh );
    memset(&status, 0xff, sizeof(MPI_Status));
    offset = rank * nitem * type_size;
    
    stime = MPI_Wtime();
    MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_SUM, &status );
    etime = MPI_Wtime();
    elapsed_time = etime - stime;
    printf ("<<Result with active storage>>\n"
            "sum=%lf (in %10.4f sec)\n", tmp[0], elapsed_time); 
    
    MPI_File_close( &fh );
  }
#endif
  MPI_Barrier(MPI_COMM_WORLD);
  if (use_gen_file == 1) free( buf );
  free( tmp );
 
  MPI_Finalize();
  return errs;
}
Example #7
void trainOneEpochDenseCPU(int itask, float *data, float *numerator,
                           float *denominator, float *codebook,
                           unsigned int nSomX, unsigned int nSomY,
                           unsigned int nDimensions, unsigned int nVectors,
                           unsigned int nVectorsPerRank, float radius,
                           float scale, string mapType, string gridType,
                           bool compact_support, bool gaussian, int *globalBmus) {
    unsigned int p1[2] = {0, 0};
    unsigned int *bmus = new unsigned int[nVectorsPerRank * 2];
#ifdef _OPENMP
    #pragma omp parallel default(shared) private(p1)
#endif
    {
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int n = 0; n < nVectorsPerRank; n++) {
#else
        for (unsigned int n = 0; n < nVectorsPerRank; n++) {
#endif
            if (itask * nVectorsPerRank + n < nVectors) {
                /// get the best matching unit
                get_bmu_coord(codebook, data, nSomY, nSomX,
                              nDimensions, p1, n);
                bmus[2 * n] = p1[0];
                bmus[2 * n + 1] = p1[1];
            }
        }
    }

    float *localNumerator = new float[nSomY * nSomX * nDimensions];
    float *localDenominator = new float[nSomY * nSomX];
#ifdef _OPENMP
    #pragma omp parallel default(shared)
#endif
    {
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int som_y = 0; som_y < nSomY; som_y++) {
#else
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                localDenominator[som_y * nSomX + som_x] = 0.0;
                for (unsigned int d = 0; d < nDimensions; d++)
                    localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] = 0.0;
            }
        }
        /// Accumulate denoms and numers
#ifdef _OPENMP
        #pragma omp for
#endif
#ifdef _WIN32
        for (int som_y = 0; som_y < nSomY; som_y++) {
#else
        for (unsigned int som_y = 0; som_y < nSomY; som_y++) {
#endif
            for (unsigned int som_x = 0; som_x < nSomX; som_x++) {
                for (unsigned int n = 0; n < nVectorsPerRank; n++) {
                    if (itask * nVectorsPerRank + n < nVectors) {
                        float dist = 0.0f;
                        if (gridType == "rectangular") {
                            if (mapType == "planar") {
                                dist = euclideanDistanceOnPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
                            }
                            else if (mapType == "toroid") {
                                dist = euclideanDistanceOnToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
                            }
                        }
                        else {
                            if (mapType == "planar") {
                                dist = euclideanDistanceOnHexagonalPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]);
                            }
                            else if (mapType == "toroid") {
                                dist = euclideanDistanceOnHexagonalToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY);
                            }
                        }
                        float neighbor_fuct = getWeight(dist, radius, scale, compact_support, gaussian);

                        for (unsigned int d = 0; d < nDimensions; d++) {
                            localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] +=
                                1.0f * neighbor_fuct
                                * (*(data + n * nDimensions + d));
                        }
                        localDenominator[som_y * nSomX + som_x] += neighbor_fuct;
                    }
                }
            }
        }
    }
#ifdef HAVE_MPI
    MPI_Reduce(localNumerator, numerator,
               nSomY * nSomX * nDimensions, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(localDenominator, denominator,
               nSomY * nSomX, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Gather(bmus, nVectorsPerRank * 2, MPI_INT, globalBmus, nVectorsPerRank * 2, MPI_INT, 0, MPI_COMM_WORLD);

#else
    for (unsigned int i = 0; i < nSomY * nSomX * nDimensions; ++i) {
        numerator[i] = localNumerator[i];
    }
    for (unsigned int i = 0; i < nSomY * nSomX; ++i) {
        denominator[i] = localDenominator[i];
    }
    for (unsigned int i = 0; i < 2 * nVectorsPerRank; ++i) {
        globalBmus[i] = bmus[i];
    }
#endif
    delete [] bmus;
    delete [] localNumerator;
    delete [] localDenominator;
}
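
Example #7 reduces the accumulated numerators and denominators to rank 0 only, which suffices when a single rank applies the codebook update. If every rank needed the reduced sums locally, MPI_Allreduce would be the drop-in replacement; a hedged sketch using the same buffers:

    // Sketch: every rank receives the elementwise sums directly,
    // avoiding a separate broadcast of the update.
    MPI_Allreduce(localNumerator, numerator, nSomY * nSomX * nDimensions,
                  MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(localDenominator, denominator, nSomY * nSomX,
                  MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);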
Example #8
int main( int argc, char *argv[] )
{
    int num_errors = 0, total_num_errors = 0;
    int rank, size;
    char port1[MPI_MAX_PORT_NAME];
    char port2[MPI_MAX_PORT_NAME];
    char port3[MPI_MAX_PORT_NAME];
    MPI_Status status;
    MPI_Comm comm1, comm2, comm3;
    int verbose = 0;
    int data = 0;

    if (getenv("MPITEST_VERBOSE"))
    {
	verbose = 1;
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (size < 4)
    {
	printf("Four processes needed to run this test.\n");
	MPI_Finalize();
	return 0;
    }

    if (rank == 0)
    {
	IF_VERBOSE(("0: opening ports.\n"));
	MPI_Open_port(MPI_INFO_NULL, port1);
	MPI_Open_port(MPI_INFO_NULL, port2);
	MPI_Open_port(MPI_INFO_NULL, port3);

	IF_VERBOSE(("0: opened port1: <%s>\n", port1));
	IF_VERBOSE(("0: opened port2: <%s>\n", port2));
	IF_VERBOSE(("0: opened port3: <%s>\n", port3));
	IF_VERBOSE(("0: sending ports.\n"));
	MPI_Send(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
	MPI_Send(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 2, 0, MPI_COMM_WORLD);
	MPI_Send(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 3, 0, MPI_COMM_WORLD);

	IF_VERBOSE(("0: accepting port3.\n"));
	MPI_Comm_accept(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3);
	IF_VERBOSE(("0: accepting port2.\n"));
	MPI_Comm_accept(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2);
	IF_VERBOSE(("0: accepting port1.\n"));
	MPI_Comm_accept(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1);

	IF_VERBOSE(("0: closing ports.\n"));
	MPI_Close_port(port1);
	MPI_Close_port(port2);
	MPI_Close_port(port3);

	IF_VERBOSE(("0: sending 1 to process 1.\n"));
	data = 1;
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm1);

	IF_VERBOSE(("0: sending 2 to process 2.\n"));
	data = 2;
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm2);

	IF_VERBOSE(("0: sending 3 to process 3.\n"));
	data = 3;
	MPI_Send(&data, 1, MPI_INT, 0, 0, comm3);

	IF_VERBOSE(("0: disconnecting.\n"));
	MPI_Comm_disconnect(&comm1);
	MPI_Comm_disconnect(&comm2);
	MPI_Comm_disconnect(&comm3);
    }
    else if (rank == 1)
    {
	IF_VERBOSE(("1: receiving port.\n"));
	MPI_Recv(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("1: received port1: <%s>\n", port1));
	IF_VERBOSE(("1: connecting.\n"));
	MPI_Comm_connect(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm1, &status);
	if (data != 1)
	{
	    printf("Received %d from root when expecting 1\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("1: disconnecting.\n"));
	MPI_Comm_disconnect(&comm1);
    }
    else if (rank == 2)
    {
	IF_VERBOSE(("2: receiving port.\n"));
	MPI_Recv(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("2: received port2: <%s>\n", port2));
	/* make sure process 1 has time to do the connect before this process 
	   attempts to connect */
	MTestSleep(2);
	IF_VERBOSE(("2: connecting.\n"));
	MPI_Comm_connect(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm2, &status);
	if (data != 2)
	{
	    printf("Received %d from root when expecting 2\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("2: disconnecting.\n"));
	MPI_Comm_disconnect(&comm2);
    }
    else if (rank == 3)
    {
	IF_VERBOSE(("3: receiving port.\n"));
	MPI_Recv(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);

	IF_VERBOSE(("2: received port2: <%s>\n", port2));
	/* make sure process 1 and 2 have time to do the connect before this 
	   process attempts to connect */
	MTestSleep(4);
	IF_VERBOSE(("3: connecting.\n"));
	MPI_Comm_connect(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3);

	MPI_Recv(&data, 1, MPI_INT, 0, 0, comm3, &status);
	if (data != 3)
	{
	    printf("Received %d from root when expecting 3\n", data);
	    fflush(stdout);
	    num_errors++;
	}

	IF_VERBOSE(("3: disconnecting.\n"));
	MPI_Comm_disconnect(&comm3);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Reduce(&num_errors, &total_num_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank == 0)
    {
	if (total_num_errors)
	{
	    printf(" Found %d errors\n", total_num_errors);
	}
	else
	{
	    printf(" No Errors\n");
	}
	fflush(stdout);
    }
    MPI_Finalize();
    return total_num_errors;
}
Example #9
int main(int argc, char *argv[])
{
    int i, numprocs, rank, size, align_size, disp;
    int skip;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double timer=0.0;
    double avg_time = 0.0, max_time = 0.0, min_time = 0.0; 
    char *sendbuf, *recvbuf, *s_buf1, *r_buf1;
    int *rdispls=NULL, *recvcounts=NULL;
    int max_msg_size = 1048576, full = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    if (process_args(argc, argv, rank, &max_msg_size, &full)) {
        MPI_Finalize();
        return EXIT_SUCCESS;
    }

    if(numprocs < 2) {
        if(rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_Finalize();

        return EXIT_FAILURE;
    }

    print_header(rank, full);

    recvcounts=rdispls=NULL;
    recvcounts = (int *) malloc (numprocs*sizeof(int));
    if(NULL == recvcounts) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }
    
    rdispls = (int *) malloc (numprocs*sizeof(int));
    if(NULL == rdispls) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }

    s_buf1 = r_buf1 = NULL;

    s_buf1 = (char *) malloc(sizeof(char)*max_msg_size + MAX_ALIGNMENT);
    if(NULL == s_buf1) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }
    
    r_buf1 = (char *) malloc(sizeof(char)*max_msg_size * numprocs + MAX_ALIGNMENT);
    if(NULL == r_buf1) {
        fprintf(stderr, "malloc failed.\n");
        exit(1);
    }


    align_size = getpagesize();

    sendbuf = (char *)(((unsigned long) s_buf1 + (align_size - 1)) / align_size
                    * align_size);
    recvbuf = (char *)(((unsigned long) r_buf1 + (align_size - 1)) / align_size
                    * align_size);

    memset(sendbuf, 1, max_msg_size);
    memset(recvbuf, 0, max_msg_size * numprocs);

    for(size=1; size <= max_msg_size; size *= 2) {

        if(size > LARGE_MESSAGE_SIZE) {
            skip = SKIP_LARGE;
            iterations = iterations_large;
        } else {
            skip = SKIP;
        }

        MPI_Barrier(MPI_COMM_WORLD);

        disp = 0;
        for ( i = 0; i < numprocs; i++) {
            recvcounts[i] = size;
            rdispls[i] = disp;
            disp += size;
        }

        MPI_Barrier(MPI_COMM_WORLD);       
        timer=0.0;
        for(i=0; i < iterations + skip ; i++) {

            t_start = MPI_Wtime();

            MPI_Allgatherv(sendbuf, size, MPI_CHAR, recvbuf, recvcounts, rdispls, MPI_CHAR, MPI_COMM_WORLD);
        
            t_stop = MPI_Wtime();

            if(i >= skip) {
                timer+= t_stop-t_start;
            }
            MPI_Barrier(MPI_COMM_WORLD);
 
        }
        
        MPI_Barrier(MPI_COMM_WORLD);

        latency = (double)(timer * 1e6) / iterations;

        MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, 
                MPI_COMM_WORLD); 
        MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, 
                MPI_COMM_WORLD); 
        MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0, 
                MPI_COMM_WORLD); 
        avg_time = avg_time/numprocs; 

        print_data(rank, full, size, avg_time, min_time, max_time, iterations);
        MPI_Barrier(MPI_COMM_WORLD);
    }
    
    free(s_buf1);
    free(r_buf1);

    free(recvcounts);
    free(rdispls);

    MPI_Finalize();

    return EXIT_SUCCESS;
}
Example #10
int main(int argc, char **argv)
{

    int size , rank, number; 

    MPI_Init(&argc, &argv); 
    MPI_Comm_size(MPI_COMM_WORLD , &size);
    MPI_Comm_rank(MPI_COMM_WORLD , &rank);

    double start = MPI_Wtime(); // start the clock
    double umaxglob = 0; // max error across all processes

    if (argc < 2) {
        printf("Usage:\n");
        printf("  poisson n\n\n");
        printf("Arguments:\n");
        printf("  n: the problem size (must be a power of 2)\n");
        MPI_Finalize();
        return 1;
    }

    // The number of grid points in each direction is n+1
    // The number of degrees of freedom in each direction is n-1
    int n = atoi(argv[1]);
    
    int m = n - 1;  // number of points in each direction of B

    int *cnt = (int *) malloc(size * sizeof(int)); // local number of matrix columns per process
    int *displs = (int *) malloc((size+1) * sizeof(int)); // displacement of each process's points in sendbuf
    displs[size] = m;
    displs[0] = 0; // the first process's displacement is always 0


    int overflow = m % size; // number of leftover columns

    for(int i = 0; i < size; i++){
        cnt[i] = m / size; // base number of columns per process
        if (overflow != 0){
            cnt[i]++; // distribute the leftover columns
            overflow--;
        }
        if (i < size-1){
            displs[i+1] = displs[i]+cnt[i];
        }
    }
 
    int nrColon = cnt[rank]; // number of columns this rank owns
    int pros_dof = nrColon*m; // number of elements this rank owns


    int nn = 4 * n;
    double h = 1.0 / n;


    // Grid points
    double *grid = mk_1D_array(n+1, false);
    double **b = mk_2D_array(nrColon, m, false);
    double **bt = mk_2D_array(nrColon, m,false);

    int trad = omp_get_max_threads(); // number of threads
    double **z = mk_2D_array(trad, nn, false); // z is 2D so that concurrent FST calls from different OpenMP threads do not overwrite each other's work array

    double *diag = mk_1D_array(m, false);     
    double *sendbuf = mk_1D_array(nrColon*m, false);
    double *recbuf = mk_1D_array(nrColon*m, false); 


    int *sendcnt = (int *) malloc((size+1) * sizeof(int)); // number of elements to send to each process
    int *sdispls = (int *) malloc((size+1) * sizeof(int)); // offset into sendbuf for each process

 

    sdispls[0] = 0; // process 0 always starts at index 0
    for(int i = 0; i < size; i++){
        sendcnt[i] = cnt[i]*cnt[rank]; // number of elements I own times the number it owns
        sdispls[i] = displs[i]*cnt[rank]; // displacement for each process
    }

    // GRID
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }




    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n)); // eigenvalues of the operator
    }

  // Initialize the right hand side data 
    
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]); // evaluate the RHS function, scaled by h*h
        }
    }

    // Calculate Btilde^T = S^-1 * (S * B)^T 
 
    #pragma omp parallel for schedule(guided, 5)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(b[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose (b, bt,nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde

    #pragma omp parallel for schedule(static)
    for (int j = 0; j < nrColon; j++) {
        for (int i = 0; i < m; i++) {
            bt[j][i] /= (diag[j+displs[rank]] + diag[i]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T)
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose (bt, b, nrColon,m, sendbuf,recbuf,sendcnt,sdispls, size, rank, displs);

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate maximal value of solution
    double u_max = 0.0;

    #pragma omp parallel for schedule(static) reduction(max:u_max)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            double temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]); // compare against the known solution; should be ~0
            if (temp > u_max){
                u_max = temp;
            }
        }
    }
    MPI_Reduce (&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // find the largest u_max across processes and store it in umaxglob

    if (rank == 0) {
        printf("Nodes = %d \n", size);
        printf("Threads per node = %d \n", omp_get_max_threads());
        printf("u_max = %e\n", umaxglob); // print the max error
        double times = MPI_Wtime()-start; // stop the clock
        printf("Time elapsed = %1.16f \n", times); // print the elapsed time
    }

    MPI_Finalize();
    return 0;
}
Example #11
void PetscVector<Complex>::localize_to_one (std::vector<Complex>& v_local,
					    const processor_id_type pid) const
{
  this->_restore_array();

  PetscErrorCode ierr=0;
  const PetscInt n  = size();
  const PetscInt nl = local_size();
  PetscScalar *values;


  v_local.resize(n);


  for (PetscInt i=0; i<n; i++)
    v_local[i] = 0.;

  // only one processor
  if (n == nl)
    {
      ierr = VecGetArray (_vec, &values);
	     CHKERRABORT(libMesh::COMM_WORLD,ierr);

      for (PetscInt i=0; i<n; i++)
	v_local[i] = static_cast<Complex>(values[i]);

      ierr = VecRestoreArray (_vec, &values);
	     CHKERRABORT(libMesh::COMM_WORLD,ierr);
    }

  // otherwise multiple processors
  else
    {
      numeric_index_type ioff = this->first_local_index();

      /* in here the local values are stored, acting as send buffer for MPI
       * initialize to zero, since we collect using MPI_SUM
       */
      std::vector<Real> real_local_values(n, 0.);
      std::vector<Real> imag_local_values(n, 0.);

      {
	ierr = VecGetArray (_vec, &values);
	       CHKERRABORT(libMesh::COMM_WORLD,ierr);

	// provide my local share to the real and imag buffers
	for (PetscInt i=0; i<nl; i++)
	  {
	    real_local_values[i+ioff] = static_cast<Complex>(values[i]).real();
	    imag_local_values[i+ioff] = static_cast<Complex>(values[i]).imag();
	  }

	ierr = VecRestoreArray (_vec, &values);
	       CHKERRABORT(libMesh::COMM_WORLD,ierr);
      }

      /* have buffers of the real and imaginary part of v_local.
       * Once MPI_Reduce() collected all the real and imaginary
       * parts in these std::vector<Real>, the values can be
       * copied to v_local
       */
      std::vector<Real> real_v_local(n);
      std::vector<Real> imag_v_local(n);

      // collect entries from other proc's in real_v_local, imag_v_local
      MPI_Reduce (&real_local_values[0], &real_v_local[0], n,
		  MPI_REAL, MPI_SUM,
		  pid, libMesh::COMM_WORLD);

      MPI_Reduce (&imag_local_values[0], &imag_v_local[0], n,
		  MPI_REAL, MPI_SUM,
		  pid, libMesh::COMM_WORLD);

      // copy real_v_local and imag_v_local to v_local
      for (PetscInt i=0; i<n; i++)
	v_local[i] = Complex(real_v_local[i], imag_v_local[i]);
    }
}
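
Example #11 reduces the real and imaginary parts in two separate MPI_SUM calls because the element type is complex. MPI 3.0 predefines MPI_CXX_DOUBLE_COMPLEX, for which MPI_SUM is a valid operation, so the pair of reductions could in principle collapse into one; a hedged sketch (buffer names illustrative, assumes std::complex<double> storage):

      // Sketch: one reduction over a complex buffer instead of two real ones.
      std::vector<std::complex<double>> local_vals(n), summed_vals(n);
      MPI_Reduce(local_vals.data(), summed_vals.data(), n,
		 MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, pid, libMesh::COMM_WORLD);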
Example #12
void grad(int *n, int nthreads) {
  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  double *data;
  Complex *data_hat;
  double setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm);

  //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  data=(double*)accfft_alloc(alloc_max);
  data_hat=(Complex*)accfft_alloc(alloc_max);

  accfft_init(nthreads);

  /* Create FFT plan */
  setup_time=-MPI_Wtime();
  accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE);
  setup_time+=MPI_Wtime();


  /*  Initialize data */
  initialize(data,n,c_comm);
  MPI_Barrier(c_comm);

  double * gradx=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * grady=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double * gradz=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double));
  double timings[5]={0};

  std::bitset<3> XYZ=0;
  XYZ[0]=1;
  XYZ[1]=1;
  XYZ[2]=1;
  double exec_time=-MPI_Wtime();
  accfft_grad(gradx,grady,gradz,data,plan,XYZ,timings);
  exec_time+=MPI_Wtime();
  /* Check err*/
  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl;
  check_err_grad(gradx,n,c_comm,0);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl;
  check_err_grad(grady,n,c_comm,1);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl;
  PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl;
  check_err_grad(gradz,n,c_comm,2);
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl;
  PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl;

  /* Compute some timings statistics */
  double g_setup_time,g_timings[5],g_exec_time;

  MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);
  MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm);

  PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
  PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl;
  PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl;

  accfft_free(data);
  accfft_free(data_hat);
  MPI_Barrier(c_comm);
  accfft_free(gradx);
  accfft_free(grady);
  accfft_free(gradz);
  accfft_destroy_plan(plan);
  accfft_cleanup();
  MPI_Comm_free(&c_comm);
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------"<<std::endl;
  PCOUT<<"-------------------------------------------------------\n"<<std::endl;
  return ;

} // end grad
Example #13
int
main(int argc, char *argv[])
{
    GRID *g;
    DOF *u_h;
    MAT *A, *A0, *B;
    MAP *map;
    INT i;
    size_t nnz, mem, mem_peak;
    VEC *x, *y0, *y1, *y2;
    double t0, t1, dnz, dnz1, mflops, mop;
    char *fn = "../test/cube.dat";
    FLOAT mem_max = 300;
    INT refine = 0;

    phgOptionsRegisterFilename("-mesh_file", "Mesh file", (char **)&fn);
    phgOptionsRegisterInt("-loop_count", "Loop count", &loop_count);
    phgOptionsRegisterInt("-refine", "Refinement level", &refine);
    phgOptionsRegisterFloat("-mem_max", "Maximum memory", &mem_max);

    phgInit(&argc, &argv);
    g = phgNewGrid(-1);
    if (!phgImport(g, fn, FALSE))
	phgError(1, "can't read file \"%s\".\n", fn);
    phgRefineAllElements(g, refine);
    u_h = phgDofNew(g, DOF_DEFAULT, 1, "u_h", DofNoAction);

    while (TRUE) {
	phgPrintf("\n");
	if (phgBalanceGrid(g, 1.2, 1, NULL, 0.))
	    phgPrintf("Repartition mesh, %d submeshes, load imbalance: %lg\n",
			g->nprocs, (double)g->lif);
	map = phgMapCreate(u_h, NULL);
	A = phgMapCreateMat(map, map);
	A->handle_bdry_eqns = TRUE;
	build_matrix(A, u_h);
	phgMatAssemble(A);

	/* Note: A is unsymmetric (A' != A) if boundary entries not removed */
	phgMatRemoveBoundaryEntries(A);

#if 0
	/* test block matrix operation */
	A0 = phgMatCreateBlockMatrix(g->comm, 1, 1, &A, NULL);
#else
	A0 = A;
#endif

	phgPrintf("%d DOF, %d elems, %d submeshes, matrix size: %d, LIF: %lg\n",
			DofGetDataCountGlobal(u_h), g->nleaf_global,
			g->nprocs, A->rmap->nglobal, (double)g->lif);

	/* test PHG mat-vec multiply */
	x = phgMapCreateVec(A->cmap, 1);
	y1 = phgMapCreateVec(A->rmap, 1);
	phgVecRandomize(x, 123);
	phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);

	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	for (i = 0; i < loop_count; i++) {
	    phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1);
	}
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	y0 = phgVecCopy(y1, NULL);
	nnz = A->nnz_d + A->nnz_o;
#if USE_MPI
	dnz1 = nnz;
	MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
	dnz = nnz;
#endif
	mop = loop_count * (dnz + dnz - A->rmap->nlocal) * 1e-6;

	phgPrintf("\n");
	t1 -= t0;
	phgPrintf("   PHG:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF)\n",
			t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops);

	/* test trans(A)*x */
	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	for (i = 0; i < loop_count; i++) {
	    phgMatVec(MAT_OP_T, 1.0, A0, x, 0.0, &y1);
	}
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	t1 -= t0;
	phgPrintf("  A'*x:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		  "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	/* time A * trans(A) */
	phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	t0 = phgGetTime(NULL);
	B = phgMatMat(MAT_OP_N, MAT_OP_N, 1.0, A, A, 0.0, NULL);
	t1 = phgGetTime(NULL);
	mflops = phgPerfGetMflops(g, NULL, NULL);
	nnz = B->nnz_d + B->nnz_o;
#if USE_MPI
	dnz1 = nnz;
	MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
	dnz = nnz;
#endif
	/* compare B*x <--> A*A*x */
	y2 = phgMatVec(MAT_OP_N, 1.0, B, x, 0.0, NULL);
	phgMatVec(MAT_OP_N, 1.0, A0, y0, 0.0, &y1);
	phgMatDestroy(&B);
	t1 -= t0;
	phgPrintf("   A*A:  time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
		  t1, dnz, mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

#if USE_PETSC
	{
	    Mat ma, mb;
	    MatInfo info;
	    Vec va, vb, vc;
	    PetscScalar *vec;

	    ma = phgPetscCreateMatAIJ(A);
	    MatGetVecs(ma, PETSC_NULL, &va);
	    VecDuplicate(va, &vb);
	    VecGetArray(va, &vec);
	    memcpy(vec, x->data, x->map->nlocal * sizeof(*vec));
	    VecRestoreArray(va, &vec);
	    MatMult(ma, va, vb);
	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    for (i = 0; i < loop_count; i++) {
		MatMult(ma, va, vb);
	    }
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    VecGetArray(vb, &vec);
	    memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
	    VecRestoreArray(vb, &vec);

	    MatGetInfo(ma, MAT_GLOBAL_SUM, &info);
	    /*phgPrintf("    --------------------------------------------"
		      "-------------------------\n");*/
	    phgPrintf("\n");
	    t1 -= t0;
	    dnz = info.nz_used;
	    phgPrintf(" PETSc:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		      "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    for (i = 0; i < loop_count; i++) {
		MatMultTranspose(ma, va, vb);
	    }
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    VecGetArray(vb, &vec);
	    memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
	    VecRestoreArray(vb, &vec);
	    t1 -= t0;
	    phgPrintf("  A'*x:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		      "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops,
		(double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    MatMatMult(ma, ma, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &mb);
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    t1 -= t0;
	    MatGetInfo(mb, MAT_GLOBAL_SUM, &info);
	    dnz = info.nz_used;
	    VecDuplicate(va, &vc);
	    /* compare B*x <--> A*A*x */
	    MatMult(ma, vb, vc);
	    MatMult(mb, va, vb);
	    VecGetArray(vb, &vec);
	    memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec));
	    VecRestoreArray(vb, &vec);
	    VecGetArray(vc, &vec);
	    memcpy(y2->data, vec, x->map->nlocal * sizeof(*vec));
	    VecRestoreArray(vc, &vec);
	    phgPrintf("   A*A:  time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
		  t1, dnz, mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

	    phgPetscMatDestroy(&mb);
	    phgPetscMatDestroy(&ma);
	    phgPetscVecDestroy(&va);
	    phgPetscVecDestroy(&vb);
	    phgPetscVecDestroy(&vc);
	}
#endif	/* USE_PETSC */

#if USE_HYPRE
	{
	    HYPRE_IJMatrix ma;
	    HYPRE_IJVector va, vb, vc;
	    HYPRE_ParCSRMatrix  par_ma;
	    hypre_ParCSRMatrix  *par_mb;
	    HYPRE_ParVector	par_va, par_vb, par_vc;
	    HYPRE_Int offset, *ni, start, end;
	    assert(sizeof(INT)==sizeof(int) && sizeof(FLOAT)==sizeof(double));
	    setup_hypre_mat(A, &ma);
	    ni = phgAlloc(2 * A->rmap->nlocal * sizeof(*ni));
	    offset = A->cmap->partition[A->cmap->rank];
	    for (i = 0; i < A->rmap->nlocal; i++)
		ni[i] = i + offset;
	    HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1,
				 &va);
	    HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1,
				 &vb);
	    HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1,
				 &vc);
	    HYPRE_IJVectorSetObjectType(va, HYPRE_PARCSR);
	    HYPRE_IJVectorSetObjectType(vb, HYPRE_PARCSR);
	    HYPRE_IJVectorSetObjectType(vc, HYPRE_PARCSR);
	    HYPRE_IJVectorSetMaxOffProcElmts(va, 0);
	    HYPRE_IJVectorSetMaxOffProcElmts(vb, 0);
	    HYPRE_IJVectorSetMaxOffProcElmts(vc, 0);
	    HYPRE_IJVectorInitialize(va);
	    HYPRE_IJVectorInitialize(vb);
	    HYPRE_IJVectorInitialize(vc);
	    HYPRE_IJMatrixGetObject(ma, (void **)(void *)&par_ma);
	    HYPRE_IJVectorGetObject(va, (void **)(void *)&par_va);
	    HYPRE_IJVectorGetObject(vb, (void **)(void *)&par_vb);
	    HYPRE_IJVectorGetObject(vc, (void **)(void *)&par_vc);
	    HYPRE_IJVectorSetValues(va, A->cmap->nlocal, ni, (double *)x->data);
	    HYPRE_IJVectorAssemble(va);
	    HYPRE_IJVectorAssemble(vb);
	    HYPRE_IJVectorAssemble(vc);

	    HYPRE_IJMatrixGetRowCounts(ma, A->cmap->nlocal,
					ni, ni + A->rmap->nlocal);
	    for (i = 0, nnz = 0; i < A->rmap->nlocal; i++)
		nnz += ni[A->rmap->nlocal + i];
#if USE_MPI
	    dnz1 = nnz;
	    MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
	    dnz = nnz;
#endif

	    HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb);
	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    for (i = 0; i < loop_count; i++) {
		HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb);
	    }
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data);
	    /*phgPrintf("    --------------------------------------------"
		      "-------------------------\n");*/
	    phgPrintf("\n");
	    t1 -= t0;
	    phgPrintf(" HYPRE:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		      "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops,
		(double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    for (i = 0; i < loop_count; i++) {
		HYPRE_ParCSRMatrixMatvecT(1.0, par_ma, par_va, 0.0, par_vb);
	    }
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data);
	    t1 -= t0;
	    phgPrintf("  A'*x:  time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), "
		      "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops,
		(double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL));

	    phgPerfGetMflops(g, NULL, NULL);	/* reset flops counter */
	    t0 = phgGetTime(NULL);
	    /* Note: 'HYPRE_ParCSRMatrix' is currently typedef'ed to
	     *	     'hypre_ParCSRMatrix *' */
	    par_mb = hypre_ParMatmul((hypre_ParCSRMatrix *)par_ma,
					(hypre_ParCSRMatrix *)par_ma);
	    t1 = phgGetTime(NULL);
	    mflops = phgPerfGetMflops(g, NULL, NULL);
	    start = hypre_ParCSRMatrixFirstRowIndex(par_mb);
	    end = hypre_ParCSRMatrixLastRowIndex(par_mb) + 1;
	    for (i = start, nnz = 0; i < end; i++) {
		HYPRE_Int ncols;
		hypre_ParCSRMatrixGetRow(par_mb, i, &ncols, NULL, NULL);
		hypre_ParCSRMatrixRestoreRow(par_mb, i, &ncols, NULL, NULL);
		nnz += ncols;
	    }
#if USE_MPI
	    dnz1 = nnz;
	    MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm);
#else
	    dnz = nnz;
#endif
	    /* compare B*x <--> A*A*x */
	    HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_vb, 0.0, par_vc);
	    HYPRE_ParCSRMatrixMatvec(1.0, (void *)par_mb, par_va, 0.0, par_vb);
	    HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data);
	    HYPRE_IJVectorGetValues(vc, A->rmap->nlocal, ni, (double*)y2->data);
	    hypre_ParCSRMatrixDestroy((par_mb));
	    t1 -= t0;
	    phgPrintf("   A*A:  time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n",
		  t1, dnz, mflops,
		 (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL));

	    phgFree(ni);
	    HYPRE_IJMatrixDestroy(ma);
	    HYPRE_IJVectorDestroy(va);
	    HYPRE_IJVectorDestroy(vb);
	    HYPRE_IJVectorDestroy(vc);
	}
#endif	/* USE_HYPRE */

	if (A0 != A)
	    phgMatDestroy(&A0);
#if 0
if (A->rmap->nglobal > 1000) {
    VEC *v = phgMapCreateVec(A->rmap, 3);
    for (i = 0; i < v->map->nlocal; i++) {
	v->data[i + 0 * v->map->nlocal] = 1 * (i + v->map->partition[g->rank]);
	v->data[i + 1 * v->map->nlocal] = 2 * (i + v->map->partition[g->rank]);
	v->data[i + 2 * v->map->nlocal] = 3 * (i + v->map->partition[g->rank]);
    }
    phgMatDumpMATLAB(A, "A", "A.m");
    phgVecDumpMATLAB(v, "v", "v.m");
    phgFinalize();
    exit(0);
}
#endif
	phgMatDestroy(&A);
	phgVecDestroy(&x);
	phgVecDestroy(&y0);
	phgVecDestroy(&y1);
	phgVecDestroy(&y2);
	phgMapDestroy(&map);
	mem = phgMemoryUsage(g, &mem_peak);
	dnz = mem / (1024.0 * 1024.0);
	dnz1 = mem_peak / (1024.0 * 1024.0);
	/*phgPrintf("    --------------------------------------------"
		  "-------------------------\n");*/
	phgPrintf("\n");
	phgPrintf("  Memory: current %0.4lgMB, peak %0.4lgMB\n", dnz, dnz1);
#if 0
{
    static int loop_count = 0;
    if (++loop_count == 4)
	break;
}
#endif
	if (mem_peak > 1024 * (size_t)1024 * mem_max)
	    break;
	phgRefineAllElements(g, 1);
    }
    phgDofFree(&u_h);
    phgFreeGrid(&g);
    phgFinalize();

    return 0;
}
Example #14
0
void Master::collectStats() {
        int64_t dummy = 0;
        double dummy_time = 0.0;
        MPI_Reduce(&dummy, &engine.conflicts, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        MPI_Reduce(&dummy, &engine.propagations, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
        /* opt_time is reduced as MPI_DOUBLE; use a double-typed dummy rather
           than reinterpreting the int64_t one */
        MPI_Reduce(&dummy_time, &engine.opt_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
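
The collectStats() pattern above works because MPI_Reduce is a collective call: the master contributes zero-valued dummies while the worker ranks are expected to make a matching call elsewhere with their real counters, so the sums and the max exclude the master's own (empty) statistics. A minimal, self-contained sketch of the same reduction idea, using MPI_IN_PLACE at the root so its own value is counted too (the counter name and its value are illustrative, not from the original code):

/* Hedged sketch: reduce a per-rank counter to rank 0.  MPI_IN_PLACE lets
 * the root reuse its receive buffer as its own contribution instead of
 * sending a separate dummy. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    long long conflicts;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    conflicts = 10 * (rank + 1);   /* stand-in for a real statistic */

    if (rank == 0)
        MPI_Reduce(MPI_IN_PLACE, &conflicts, 1, MPI_LONG_LONG_INT,
                   MPI_SUM, 0, MPI_COMM_WORLD);
    else
        MPI_Reduce(&conflicts, NULL, 1, MPI_LONG_LONG_INT,
                   MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("global conflict count: %lld\n", conflicts);

    MPI_Finalize();
    return 0;
}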
Example #15
0
int main(int argc, char ** argv)
{
  int Block_order;
  size_t Block_size;
  size_t Colblock_size;
  int Tile_order=32;
  int tiling;
  int Num_procs;     /* Number of ranks                                          */
  int order;         /* overall matrix order                                     */
  int send_to, recv_from; /* communicating ranks                                 */
  size_t bytes;      /* total amount of data to be moved                         */
  int my_ID;         /* rank                                                     */
  int root=0;        /* root rank of a communicator                              */
  int iterations;    /* number of times to run the pipeline algorithm            */
  int i, j, it, jt, ID;/* dummies                                                */
  int iter;          /* index of iteration                                       */
  int phase;         /* phase in the staged communication                        */
  size_t colstart;   /* sequence number of first column owned by calling rank    */
  int error=0;       /* error flag                                               */
  double *A_p;       /* original matrix column block                             */
  double *B_p;       /* transposed matrix column block                           */
  double *Work_in_p; /* workspace for the transpose function                     */
  double *Work_out_p;/* workspace for the transpose function                     */
  double abserr, abserr_tot; /* computed error                                   */
  double epsilon = 1.e-8; /* error tolerance                                     */
  double local_trans_time, /* timing parameters                                  */
         trans_time,
         avgtime;
  MPI_Status status; /* completion status of message                             */
  MPI_Win shm_win_A; /* Shared Memory window object                              */
  MPI_Win shm_win_B; /* Shared Memory window object                              */
  MPI_Win shm_win_Work_in; /* Shared Memory window object                        */
  MPI_Win shm_win_Work_out; /* Shared Memory window object                       */
  MPI_Info rma_winfo;/* info for window                                          */
  MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator                      */
  MPI_Comm shm_comm; /* Shared Memory Communicator                               */
  int shm_procs;     /* # of ranks in shared domain                              */
  int shm_ID;        /* MPI rank within coherence domain                         */
  int group_size;    /* number of ranks per shared memory group                  */
  int Num_groups;    /* number of shared memory group                            */
  int group_ID;      /* sequence number of shared memory group                   */
  int size_mul;      /* size multiplier; 0 for non-root ranks in coherence domain*/
  int istart;
  MPI_Request send_req, recv_req;

/*********************************************************************************
** Initialize the MPI environment
**********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  root = 0;

/*********************************************************************
** process, test and broadcast input parameter
*********************************************************************/

  if (my_ID == root){
    if (argc != 4 && argc !=5){
      printf("Usage: %s  <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    } 

    order = atoi(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc == 5) Tile_order = atoi(*++argv);

    ENDOFTESTS:;
  }
  bail_out(error); 

  /*  Broadcast input data to all ranks */
  MPI_Bcast(&order,      1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM Matrix transpose: B = A^T\n");
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Rank group size      = %d\n", group_size);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
       printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
#ifndef SYNCHRONOUS
    printf("Non-");
#endif
    printf("Blocking messages\n");
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that a SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
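  /* data moved per iteration: read all of A and write all of B,
     i.e. 2 * order^2 doubles */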
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks, each mapped to a single 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Num_groups     = Num_procs/group_size;
  Block_order    = order/Num_groups;

  group_ID       = my_ID/group_size;
  colstart       = Block_order * group_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the column block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/

  /* RMA win info */
  MPI_Info_create(&rma_winfo);
  /* This key indicates that passive target RMA will not be used.
   * It is the one info key that MPICH actually uses for optimization. */
  MPI_Info_set(rma_winfo, "no_locks", "true");

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  int offset = 32;
  MPI_Aint size = (Colblock_size+offset)*sizeof(double)*size_mul;
  int disp_unit;
  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &A_p, &shm_win_A);
  MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p);
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);
  A_p += offset;

  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &B_p, &shm_win_B);
  MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p);
  if (B_p == NULL){
    printf(" Error allocating space for transposed matrix by group %d\n",group_ID);
    error = 1;
  }
  bail_out(error);
  B_p += offset;

  if (Num_groups>1) {

    size = Block_size*sizeof(double)*size_mul;
    MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, 
                           (void *) &Work_in_p, &shm_win_Work_in);
    MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_in_p);
    if (Work_in_p == NULL){
      printf(" Error allocating space for in block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);

    MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, 
                            shm_comm, (void *) &Work_out_p, &shm_win_Work_out);
    MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_out_p);
    if (Work_out_p == NULL){
      printf(" Error allocating space for out block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);
  }

  /* Fill the original column matrix                                             */
  istart = 0;  
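  /* each rank of the SHM group fills and transposes its own contiguous
     slice of chunk_size columns of this group's column block */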
  int chunk_size = Block_order/group_size;
  if (tiling) {
    for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j+=Tile_order) {
      for (i=0; i<order; i+=Tile_order)
        for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++)
          for (it=i; it<MIN(order,i+Tile_order); it++) {
            A(it,jt) = (double) (order*(jt+colstart) + it);
            B(it,jt) = -1.0;
          }
    }
  }
  else {
    for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) 
      for (i=0;i<order; i++) {
        A(i,j) = (double) (order*(j+colstart) + i);
        B(i,j) = -1.0;
      }
  }
  /* NEED A STORE FENCE HERE                                                     */
  MPI_Barrier(shm_comm);

  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                    */
    istart = colstart; 
    if (!tiling) {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) {
        for (j=0; j<Block_order; j++)
          B(j,i) = A(i,j);
      }
    }
    else {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order)
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order); jt++) {
              B(jt,it) = A(it,jt);
            }
      }
    }

    for (phase=1; phase<Num_groups; phase++){
      recv_from = ((group_ID + phase             )%Num_groups);
      send_to   = ((group_ID - phase + Num_groups)%Num_groups);

#ifndef SYNCHRONOUS
      if (shm_ID==0) {
         MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, 
                   recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req);  
      }
#endif

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++)
          for (j=0; j<Block_order; j++) {
            Work_out(j,i) = A(i,j);
          }
      }
      else {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order)
          for (j=0; j<Block_order; j+=Tile_order)
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order); jt++) {
                Work_out(jt,it) = A(it,jt);
              }
      }

      /* NEED A LOAD/STORE FENCE HERE                                            */
      MPI_Barrier(shm_comm);
      if (shm_ID==0) {
#ifndef SYNCHRONOUS  
        MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                  phase, MPI_COMM_WORLD, &send_req);
        MPI_Wait(&recv_req, &status);
        MPI_Wait(&send_req, &status);
#else
        MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, 
                     phase, Work_in_p, Block_size, MPI_DOUBLE, 
  	             recv_from*group_size, phase, MPI_COMM_WORLD, &status);
#endif
      }
      /* NEED A LOAD FENCE HERE                                                  */ 
      MPI_Barrier(shm_comm);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) = Work_in(i,j);

    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  /*  for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */
  for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
    for (i=0;i<order; i++) { 
      abserr += ABS(B(i,j) - (double)(order*i + j+colstart));
    }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr_tot);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  MPI_Win_free(&shm_win_A);
  MPI_Win_free(&shm_win_B);
  if (Num_groups>1) {
      MPI_Win_free(&shm_win_Work_in);
      MPI_Win_free(&shm_win_Work_out);
  }

  MPI_Info_free(&rma_winfo);

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
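
The transpose above rests on two MPI-3 shared-memory calls: MPI_Win_allocate_shared creates one window per coherence domain (only the SHM root requests a nonzero size), and MPI_Win_shared_query hands every rank in that domain a direct load/store pointer into the segment, after which plain barriers order the stores and loads. A stripped-down sketch of the same pattern (buffer length and fill value are illustrative only):

/* Hedged sketch: node-local shared memory via MPI-3 windows. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int world_rank, shm_rank, disp_unit;
    MPI_Comm shm_comm;
    MPI_Win win;
    MPI_Aint size;
    double *buf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* ranks that can share memory form one communicator */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &shm_comm);
    MPI_Comm_rank(shm_comm, &shm_rank);

    /* only the SHM root allocates a nonzero segment, as in the example */
    size = (shm_rank == 0) ? 1024 * sizeof(double) : 0;
    MPI_Win_allocate_shared(size, sizeof(double), MPI_INFO_NULL, shm_comm,
                            &buf, &win);

    /* every rank recovers a usable pointer into rank 0's segment */
    MPI_Win_shared_query(win, 0, &size, &disp_unit, &buf);

    if (shm_rank == 0)
        buf[0] = 42.0;         /* plain store into the shared segment */
    MPI_Barrier(shm_comm);     /* order the store before the loads below */
    printf("world rank %d sees buf[0] = %g\n", world_rank, buf[0]);

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}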
Example #16
0
int main(int argc, char *argv[])
{
	bool wantwf, verb, dumpwf;

	int ix, iz, is, it, wfit, im, ik, i, j, itau;
    int ns, nx, nz, nt, wfnt, rnx, rnz, nzx, rnzx, vnx, ntau, htau, nds;
	int scalet, snap, snapshot, fnx, fnz, fnzx, nk, nb;
	int rectx, rectz, gpz, n, m, pad1, trunc, spx, spz;

	float dt, t0, z0, dz, x0, dx, s0, ds, wfdt, srctrunc;
    float dtau, tau0, tau;

	char *path1, *path2, number[5], *left, *right;

	double tstart, tend;
	struct timeval tim;
	
	sf_complex c, **lt, **rt;
	sf_complex *ww, **dd;
	float ***img1, **img2, ***mig1, **mig2;
    float *rr, **ccr, **sill, ***fwf, ***bwf;
	sf_complex *cwave, *cwavem, **wave, *curr;

	sf_axis at, ax, az, atau;

	sf_file Fdat, Fsrc, Fimg1, Fimg2;
	sf_file Ffwf, Fbwf, Fvel;
	sf_file Fleft, Fright;

	int cpuid, numprocs, nth;
    float *sendbuf, *recvbuf;
	MPI_Comm comm=MPI_COMM_WORLD;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(comm, &cpuid);
	MPI_Comm_size(comm, &numprocs);

	sf_init(argc, argv);

#ifdef _OPENMP
#pragma omp parallel
	{
		nth=omp_get_num_threads();
	}
	sf_warning(">>> Using %d threads <<<", nth);
#endif

	gettimeofday(&tim, NULL);
	tstart=tim.tv_sec+(tim.tv_usec/1000000.0);

	if(!sf_getbool("wantwf", &wantwf)) wantwf=false;
    if(!sf_getbool("verb", &verb)) verb=false;
	if(!sf_getint("pad1", &pad1)) pad1=1;
	/* padding factor on the first axis */

	if(!sf_getint("nb", &nb)) sf_error("Need nb= ");
	if(!sf_getfloat("srctrunc", &srctrunc)) srctrunc=0.4;
	if(!sf_getint("rectx", &rectx)) rectx=2;
	if(!sf_getint("rectz", &rectz)) rectz=2;

	if(!sf_getint("scalet", &scalet)) scalet=1;
	if(!sf_getint("snap", &snap)) snap=100;
	/* interval of the output wavefield */
	if(!sf_getint("snapshot", &snapshot)) snapshot=0;
	/* print out the wavefield snapshots of this shot */
    if(!sf_getint("nds", &nds)) sf_error("Need nds=!");
    
    /* source and receiver positions */
	if(!sf_getint("gpz", &gpz)) sf_error("Need gpz=");
	if(!sf_getint("spx", &spx)) sf_error("Need spx=");
	if(!sf_getint("spz", &spz)) sf_error("Need spz=");
    
    /* tau parameters */
    if(!sf_getint("ntau", &ntau)) sf_error("Need ntau=");
    if(!sf_getfloat("dtau", &dtau)) sf_error("Need dtau=");
    if(!sf_getfloat("tau0", &tau0)) sf_error("Need tau0=");

	/* input/output files */
	Fdat=sf_input("in");
	Fimg1=sf_output("out");
    Fimg2=sf_output("Fimg2");
    Fsrc=sf_input("Fsrc");
    Fvel=sf_input("Fpadvel");

	if(wantwf){
		Ffwf=sf_output("Ffwf");
        Fbwf=sf_output("Fbwf");
	}

	at=sf_iaxa(Fsrc, 1); nt=sf_n(at); dt=sf_d(at); t0=sf_o(at);
    ax=sf_iaxa(Fvel, 2); vnx=sf_n(ax); x0=sf_o(ax);
	az=sf_iaxa(Fvel, 1); rnz=sf_n(az); dz=sf_d(az); z0=sf_o(az);
    if(!sf_histint(Fdat, "n2", &rnx)) sf_error("Need n2= in input!");
    if(!sf_histfloat(Fdat, "d2", &dx)) sf_error("Need d2= in input!");
    if(!sf_histint(Fdat, "n3", &ns)) sf_error("Need n3= in input!");
    if(!sf_histfloat(Fdat, "d3", &ds)) sf_error("Need d3= in input!");
    if(!sf_histfloat(Fdat, "o3", &s0)) sf_error("Need o3= in input!");
    
    wfnt=(nt-1)/scalet+1;
    wfdt=dt*scalet;
    
    /* double check the geometry parameters */
    if(nds != (int)(ds/dx)) sf_error("Need ds/dx= %d", nds);
	sf_warning("s0=%g, x0+(rnx-1)*dx/2=%g", s0, x0+(rnx-1)*dx/2);
    //if(s0 != x0+(rnx-1)*dx/2) sf_error("Wrong origin information!");
    if(vnx != nds*(ns-1)+rnx) sf_error("Wrong dimension in x axis!");

    /* set up the output files */
    atau=sf_iaxa(Fsrc, 1);
    sf_setn(atau, ntau);
    sf_setd(atau, dtau);
    sf_seto(atau, tau0);
    sf_setlabel(atau, "Tau");
    sf_setunit(atau, "s");
    
    sf_oaxa(Fimg1, az, 1);
    sf_oaxa(Fimg1, ax, 2);
    sf_oaxa(Fimg1, atau, 3);
    sf_oaxa(Fimg2, az, 1);
    sf_oaxa(Fimg2, ax, 2);
    sf_putint(Fimg2, "n3", 1);
    sf_settype(Fimg1, SF_FLOAT);
    sf_settype(Fimg2, SF_FLOAT);
    
    if(wantwf){
		sf_setn(ax, rnx);
        sf_seto(ax, -(rnx-1)*dx/2.0);
        sf_oaxa(Ffwf, az, 1);
        sf_oaxa(Ffwf, ax, 2);
        sf_putint(Ffwf, "n3", (wfnt-1)/snap+1);
        sf_putfloat(Ffwf, "d3", snap*wfdt);
        sf_putfloat(Ffwf, "o3", t0);
        sf_putstring(Ffwf, "label3", "Time");
        sf_putstring(Ffwf, "unit3", "s");
        sf_settype(Ffwf, SF_FLOAT);
        
        sf_oaxa(Fbwf, az, 1);
        sf_oaxa(Fbwf, ax, 2);
        sf_putint(Fbwf, "n3", (wfnt-1)/snap+1);
        sf_putfloat(Fbwf, "d3", -snap*wfdt);
        sf_putfloat(Fbwf, "o3", (wfnt-1)*wfdt);
        sf_putstring(Fbwf, "label3", "Time");
        sf_putstring(Fbwf, "unit3", "s");
        sf_settype(Fbwf, SF_FLOAT);
	}
	
    nx=rnx+2*nb; nz=rnz+2*nb;
	nzx=nx*nz; rnzx=rnz*rnx;
    nk=cfft2_init(pad1, nz, nx, &fnz, &fnx);
	fnzx=fnz*fnx;
    
	/* print axis parameters for double check */
    sf_warning("cpuid=%d, numprocs=%d", cpuid, numprocs);
	sf_warning("nt=%d, dt=%g, scalet=%d, wfnt=%d, wfdt=%g",nt, dt, scalet, wfnt, wfdt);
	sf_warning("vnx=%d, nx=%d, dx=%g, nb=%d, rnx=%d", vnx, nx, dx, nb, rnx);
	sf_warning("nz=%d, rnz=%d, dz=%g, z0=%g", nz, rnz, dz, z0);
	sf_warning("spx=%d, spz=%d, gpz=%d", spx, spz, gpz);
	sf_warning("ns=%d, ds=%g, s0=%g", ns, ds, s0);
    sf_warning("ntau=%d, dtau=%g, tau0=%g", ntau, dtau, tau0);
    sf_warning("nzx=%d, fnzx=%d, nk=%d", nzx, fnzx, nk);

	/* allocate storage and read data */
	ww=sf_complexalloc(nt);
	sf_complexread(ww, nt, Fsrc);
	sf_fileclose(Fsrc);
	
    gpz=gpz+nb;
    spz=spz+nb;
    spx=spx+nb;
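    /* convert the source truncation time (seconds) into a sample count;
       the +0.5 rounds to nearest */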
    trunc=srctrunc/dt+0.5;
    
	dd=sf_complexalloc2(nt, rnx);
	rr=sf_floatalloc(nzx);
	reflgen(nz, nx, spz, spx, rectz, rectx, 0, rr);
    
    fwf=sf_floatalloc3(rnz, rnx, wfnt);
    bwf=sf_floatalloc3(rnz, rnx, wfnt);
    img1=sf_floatalloc3(rnz, vnx, ntau);
    img2=sf_floatalloc2(rnz, vnx);
    mig1=sf_floatalloc3(rnz, rnx, ntau);
    mig2=sf_floatalloc2(rnz, rnx);
    
    ccr=sf_floatalloc2(rnz, rnx);
    sill=sf_floatalloc2(rnz, rnx);
    
    curr=sf_complexalloc(fnzx);
	cwave=sf_complexalloc(nk);
	cwavem=sf_complexalloc(nk);
    icfft2_allocate(cwavem);
    
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz, itau)
#endif
    for(ix=0; ix<vnx; ix++){
        for(iz=0; iz<rnz; iz++){
            img2[ix][iz]=0.;
            for(itau=0; itau<ntau; itau++){
                img1[itau][ix][iz]=0.;
            }
        }
    }

	path1=sf_getstring("path1");
	path2=sf_getstring("path2");
	if(path1==NULL) path1="./mat/left";
	if(path2==NULL) path2="./mat/right";
	/* shot loop */
	for (is=cpuid; is<ns; is+=numprocs){
		/* construct the names of left and right matrices */
		/* leave room for the shot number and the trailing NUL */
		left=sf_charalloc(strlen(path1)+6);
		right=sf_charalloc(strlen(path2)+6);
		strcpy(left, path1);
		strcpy(right, path2);
		sprintf(number, "%d", is+1);
		strcat(left, number);
		strcat(right, number);

		Fleft=sf_input(left);
		Fright=sf_input(right);
		
		if(!sf_histint(Fleft, "n1", &n) || n != nzx) sf_error("Need n1=%d in Fleft", nzx);
		if(!sf_histint(Fleft, "n2", &m)) sf_error("No n2 in Fleft");
		if(!sf_histint(Fright, "n1", &n) || n != m) sf_error("Need n1=%d in Fright", m);
		if(!sf_histint(Fright, "n2", &n) || n != nk) sf_error("Need n2=%d in Fright", nk);
		
		/* allocate storage for each shot migration */
		lt=sf_complexalloc2(nzx, m);
		rt=sf_complexalloc2(m, nk);
		sf_complexread(lt[0], nzx*m, Fleft);
		sf_complexread(rt[0], m*nk, Fright);
        sf_fileclose(Fleft);
		sf_fileclose(Fright);
        
        /* read data */
        sf_seek(Fdat, is*rnx*nt*sizeof(float complex), SEEK_SET);
        sf_complexread(dd[0], rnx*nt, Fdat);
        
        /* initialize curr and imaging variables */
#ifdef _OPENMP
#pragma omp parallel for private(iz)
#endif
		for(iz=0; iz<fnzx; iz++){
			curr[iz]=sf_cmplx(0.,0.);
		}
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz, itau)
#endif
        for(ix=0; ix<rnx; ix++){
            for(iz=0; iz<rnz; iz++){
                mig2[ix][iz]=0.;
                ccr[ix][iz]=0.;
                sill[ix][iz]=0.;
                for(itau=0; itau<ntau; itau++){
                    mig1[itau][ix][iz]=0.;
                }
            }
        }
        
        /* wave */
		wave=sf_complexalloc2(fnzx, m);
        
        /* snapshot: dump wavefields only for the requested shot; use a
           separate flag so wantwf survives for later shots on this rank */
        dumpwf = (wantwf && is==snapshot);
		
		wfit=0;
		for(it=0; it<nt; it++){
			if(verb) sf_warning("Forward propagation it=%d/%d",it+1, nt);
			
			cfft2(curr, cwave);
			for(im=0; im<m; im++){
#ifdef _OPENMP
#pragma omp parallel for private(ik)
#endif
				for(ik=0; ik<nk; ik++){
#ifdef SF_HAS_COMPLEX_H
					cwavem[ik]=cwave[ik]*rt[ik][im];
#else
					cwavem[ik]=sf_cmul(cwave[ik],rt[ik][im]);
#endif
				}
				icfft2(wave[im],cwavem);
			}

#ifdef _OPENMP
#pragma omp parallel for private(ix, iz, i, j, im, c) shared(curr, it)
#endif
			for(ix=0; ix<nx; ix++){
				for(iz=0; iz<nz; iz++){
					i=iz+ix*nz;
					j=iz+ix*fnz;
					
					if(it<trunc){
#ifdef SF_HAS_COMPLEX_H
						c=ww[it]*rr[i];
#else
						c=sf_crmul(ww[it],rr[i]);
#endif
					}else{
						c=sf_cmplx(0.,0.);
					}
					
//                    c += curr[j];
                    
					for(im=0; im<m; im++){
#ifdef SF_HAS_COMPLEX_H
						c += lt[im][i]*wave[im][j];
#else
						c += sf_cmul(lt[im][i], wave[im][j]);
#endif
					}
					curr[j]=c;
				}
			}
			
			if(it%scalet==0){
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
				for(ix=0; ix<rnx; ix++){
                    for(iz=0; iz<rnz; iz++){
                        fwf[wfit][ix][iz]=crealf(curr[(ix+nb)*fnz+(iz+nb)]);
                    }
                }
                wfit++;
            }
        } //end of it
        
        /* check wfnt */
        if(wfit != wfnt) sf_error("At this point, wfit should be equal to wfnt");

        
        /* backward propagation starts from here... */
#ifdef _OPENMP
#pragma omp parallel for private(iz)
#endif
		for(iz=0; iz<fnzx; iz++){
			curr[iz]=sf_cmplx(0.,0.);
		}
        
        wfit=wfnt-1;
        for(it=nt-1; it>=0; it--){
            if(verb) sf_warning("Backward propagation it=%d/%d",it+1, nt);
#ifdef _OPENMP
#pragma omp parallel for private(ix)
#endif
            for(ix=0; ix<rnx; ix++){
                curr[(ix+nb)*fnz+gpz]+=dd[ix][it];
            }
            
            cfft2(curr, cwave);
            
			for(im=0; im<m; im++){
#ifdef _OPENMP
#pragma omp parallel for private(ik)
#endif
				for(ik=0; ik<nk; ik++){
#ifdef SF_HAS_COMPLEX_H
					cwavem[ik]=cwave[ik]*conjf(rt[ik][im]);
#else
					cwavem[ik]=sf_cmul(cwave[ik],conjf(rt[ik][im]));
#endif
				}
				icfft2(wave[im],cwavem);
			}
            
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz, i, j, im, c) shared(curr, it)
#endif
			for(ix=0; ix<nx; ix++){
				for(iz=0; iz<nz; iz++){
					i=iz+ix*nz;
					j=iz+ix*fnz;
					
//                    c=curr[j];
                      c=sf_cmplx(0.,0.);
					
					for(im=0; im<m; im++){
#ifdef SF_HAS_COMPLEX_H
						c += conjf(lt[im][i])*wave[im][j];
#else
						c += sf_cmul(conjf(lt[im][i]), wave[im][j]);
#endif
					}
					curr[j]=c;
				}
			}
			
			if(it%scalet==0){
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
				for(ix=0; ix<rnx; ix++){
                    for(iz=0; iz<rnz; iz++){
                        bwf[wfit][ix][iz]=crealf(curr[(ix+nb)*fnz+(iz+nb)]);
                        ccr[ix][iz] += fwf[wfit][ix][iz]*bwf[wfit][ix][iz];
                        sill[ix][iz] += fwf[wfit][ix][iz]*fwf[wfit][ix][iz];
                    }
                }
                wfit--;
            }
        } //end of it
        if(wfit != -1) sf_error("Check program! The final wfit should be -1!");

        /* free storage */
        free(*rt); free(rt);
        free(*lt); free(lt);
        free(*wave); free(wave);
        free(left); free(right);
        
        /* normalized image */
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
        for (ix=0; ix<rnx; ix++){
            for(iz=0; iz<rnz; iz++){
                mig2[ix][iz]=ccr[ix][iz]/(sill[ix][iz]+SF_EPS);
		//		sill[ix][iz]=0.;
            }
        }
        
        /* calculate time shift gathers */
        for(itau=0; itau<ntau; itau++){
			sf_warning("itau/ntau=%d/%d", itau+1, ntau);
            tau=itau*dtau+tau0;
            htau=tau/wfdt;

            for(it=abs(htau); it<wfnt-abs(htau); it++){
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
                for(ix=0; ix<rnx; ix++){
                    for(iz=0; iz<rnz; iz++){
                        mig1[itau][ix][iz]+=fwf[it+htau][ix][iz]*bwf[it-htau][ix][iz];
		//				sill[ix][iz]+=fwf[it+htau][ix][iz]*fwf[it+htau][ix][iz];
                    } // end of iz
                } // end of ix
            } // end of it
			
/*
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif */
			/* source illumination
			for(ix=0; ix<rnx; ix++){
				for(iz=0; iz<rnz; iz++){
					mig1[itau][ix][iz] = mig1[itau][ix][iz]/(sill[ix][iz]+SF_EPS);
				}
			} */
        } //end of itau
        
        /* output wavefield snapshot */
        if(dumpwf){
            for(it=0; it<wfnt; it++){
                if(it%snap==0){
                    sf_floatwrite(fwf[it][0], rnzx, Ffwf);
                    sf_floatwrite(bwf[wfnt-1-it][0], rnzx, Fbwf);
                }
            }
            sf_fileclose(Ffwf);
            sf_fileclose(Fbwf);
        }
        
        /* add all the shot images that are on the same node */
#ifdef _OPENMP
#pragma omp parallel for private(itau, ix, iz)
#endif
        for(itau=0; itau<ntau; itau++){
            for(ix=0; ix<rnx; ix++){
                for(iz=0; iz<rnz; iz++){
                    img1[itau][ix+is*nds][iz] += mig1[itau][ix][iz];
                }
            }
        }
        
#ifdef _OPENMP
#pragma omp parallel for private(ix, iz)
#endif
        for(ix=0; ix<rnx; ix++){
            for(iz=0; iz<rnz; iz++){
                img2[ix+is*nds][iz] += mig2[ix][iz];
            }
        }
	}
    /* end of shot loop */
	MPI_Barrier(comm);
    
    cfft2_finalize();
    sf_fileclose(Fdat);
    
    free(ww); free(rr);
	free(*dd); free(dd);
	free(cwave); free(cwavem); free(curr);
    free(*ccr); free(ccr);
    free(*sill); free(sill);
    free(**fwf); free(*fwf); free(fwf);
    free(**bwf); free(*bwf); free(bwf);
    free(**mig1); free(*mig1); free(mig1);
    free(*mig2); free(mig2);
    
    /* sum image */
    if(cpuid==0){
        sendbuf=(float *)MPI_IN_PLACE;
        recvbuf=img1[0][0];
    }else{
        sendbuf=img1[0][0];
        recvbuf=NULL;
    }
    MPI_Reduce(sendbuf, recvbuf, ntau*vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
    
    if(cpuid==0){
        sendbuf=MPI_IN_PLACE;
        recvbuf=img2[0];
    }else{
        sendbuf=img2[0];
        recvbuf=NULL;
    }
    MPI_Reduce(sendbuf, recvbuf, vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm);
    
    /* output image */
    if(cpuid==0){
        sf_floatwrite(img1[0][0], ntau*vnx*rnz, Fimg1);
        sf_floatwrite(img2[0], vnx*rnz, Fimg2);
    }
	MPI_Barrier(comm);

	sf_fileclose(Fimg1);
    sf_fileclose(Fimg2);
    free(**img1); free(*img1); free(img1);
    free(*img2); free(img2);
    
	gettimeofday(&tim, NULL);
	tend=tim.tv_sec+(tim.tv_usec/1000000.0);
	sf_warning(">> The computing time is %.3lf minutes <<", (tend-tstart)/60.);

	MPI_Finalize();
	exit(0);
}
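
The itau loop in the example implements a time-shift imaging condition: for every trial shift tau, the forward-propagated source wavefield and the backward-propagated receiver wavefield are cross-correlated as I(x,z,tau) = sum over t of S(x,z,t+tau) * R(x,z,t-tau). A hedged sketch of just that correlation, detached from the propagation machinery (the function and its argument names are illustrative; the array layout mirrors fwf/bwf/mig1 above):

#include <stdlib.h>

/* Accumulate the time-shift image img[itau][ix][iz] +=
 * fwf[it+htau][ix][iz] * bwf[it-htau][ix][iz], with it clipped so that
 * both shifted time indices stay inside [0, wfnt). */
static void timeshift_image(float ***fwf, float ***bwf, float ***img,
                            int wfnt, int nx, int nz,
                            int ntau, float dtau, float tau0, float wfdt)
{
    for (int itau = 0; itau < ntau; itau++) {
        int htau = (int)((itau * dtau + tau0) / wfdt); /* shift in samples */
        for (int it = abs(htau); it < wfnt - abs(htau); it++)
            for (int ix = 0; ix < nx; ix++)
                for (int iz = 0; iz < nz; iz++)
                    img[itau][ix][iz] +=
                        fwf[it + htau][ix][iz] * bwf[it - htau][ix][iz];
    }
}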
Example #17
0
double calc_forces(double* xi_opt, double* forces, int flag)
{
  double tmpsum, sum = 0.0;
  int first, col, ne, size, i = flag;
  double* xi = NULL;
  apot_table_t* apt = &g_pot.apot_table;
  double charge[g_param.ntypes];
  double sum_charges;
  double dp_kappa;

#if defined(DIPOLE)
  double dp_alpha[g_param.ntypes];
  double dp_b[g_calc.paircol];
  double dp_c[g_calc.paircol];
#endif  // DIPOLE

  static double rho_sum_loc, rho_sum;
  rho_sum_loc = rho_sum = 0.0;

  switch (g_pot.format_type) {
    case POTENTIAL_FORMAT_UNKNOWN:
      error(1, "Unknown potential format detected! (%s:%d)\n", __FILE__, __LINE__);
    case POTENTIAL_FORMAT_ANALYTIC:
      xi = g_pot.calc_pot.table;
      break;
    case POTENTIAL_FORMAT_TABULATED_EQ_DIST:
    case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST:
      xi = xi_opt;
      break;
    case POTENTIAL_FORMAT_KIM:
      error(1, "KIM format is not supported by EAM elstat force routine!");
  }

#if !defined(MPI)
  g_mpi.myconf = g_config.nconf;
#endif  // MPI

  ne = g_pot.apot_table.total_ne_par;
  size = apt->number;

  /* This is the start of an infinite loop */
  while (1) {
    tmpsum = 0.0; /* sum of squares of local process */
    rho_sum_loc = 0.0;

#if defined APOT && !defined MPI
    if (g_pot.format_type == POTENTIAL_FORMAT_ANALYTIC) {
      apot_check_params(xi_opt);
      update_calc_table(xi_opt, xi, 0);
    }
#endif  // APOT && !MPI

#if defined(MPI)
/* exchange potential and flag value */
#if !defined(APOT)
    MPI_Bcast(xi, g_pot.calc_pot.len, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#endif  // APOT
    MPI_Bcast(&flag, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (flag == 1)
      break; /* Exception: flag 1 means clean up */

#if defined(APOT)
    if (g_mpi.myid == 0)
      apot_check_params(xi_opt);
    MPI_Bcast(xi_opt, g_calc.ndimtot, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    if (g_pot.format_type == POTENTIAL_FORMAT_ANALYTIC)
      update_calc_table(xi_opt, xi, 0);
#else   /* APOT */
    /* if flag==2 then the potential parameters have changed -> sync */
    if (flag == 2)
      potsync();
#endif  // APOT
#endif  // MPI

    /* local arrays for electrostatic parameters */
    sum_charges = 0;
    for (i = 0; i < g_param.ntypes - 1; i++) {
      if (xi_opt[2 * size + ne + i]) {
        charge[i] = xi_opt[2 * size + ne + i];
        sum_charges += apt->ratio[i] * charge[i];
      } else {
        charge[i] = 0.0;
      }
    }
    apt->last_charge = -sum_charges / apt->ratio[g_param.ntypes - 1];
    charge[g_param.ntypes - 1] = apt->last_charge;
    if (xi_opt[2 * size + ne + g_param.ntypes - 1]) {
      dp_kappa = xi_opt[2 * size + ne + g_param.ntypes - 1];
    } else {
      dp_kappa = 0.0;
    }

#if defined(DIPOLE)
    for (i = 0; i < g_param.ntypes; i++) {
      if (xi_opt[2 * size + ne + g_param.ntypes + i]) {
        dp_alpha[i] = xi_opt[2 * size + ne + g_param.ntypes + i];
      } else {
        dp_alpha[i] = 0.0;
      }
    }
    for (i = 0; i < g_calc.paircol; i++) {
      if (xi_opt[2 * size + ne + 2 * g_param.ntypes + i]) {
        dp_b[i] = xi_opt[2 * size + ne + 2 * g_param.ntypes + i];
      } else {
        dp_b[i] = 0.0;
      }
      if (xi_opt[2 * size + ne + 2 * g_param.ntypes + g_calc.paircol + i]) {
        dp_c[i] =
            xi_opt[2 * size + ne + 2 * g_param.ntypes + g_calc.paircol + i];
      } else {
        dp_c[i] = 0.0;
      }
    }
#endif  // DIPOLE

    /* init second derivatives for splines */

    /* pair potentials & rho */
    for (col = 0; col < g_calc.paircol + g_param.ntypes; col++) {
      first = g_pot.calc_pot.first[col];
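      /* the two table slots just before 'first' carry the boundary slopes
         that are handed to the spline initializers below */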

      switch (g_pot.format_type) {
        case POTENTIAL_FORMAT_UNKNOWN:
          error(1, "Unknown potential format detected! (%s:%d)\n", __FILE__,
                __LINE__);
        case POTENTIAL_FORMAT_ANALYTIC:
        case POTENTIAL_FORMAT_TABULATED_EQ_DIST: {
          spline_ed(g_pot.calc_pot.step[col], xi + first,
                    g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2),
                    0.0, g_pot.calc_pot.d2tab + first);
          break;
        }
        case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST: {
          spline_ne(g_pot.calc_pot.xcoord + first, xi + first,
                    g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2),
                    0.0, g_pot.calc_pot.d2tab + first);
          break;
        }
        case POTENTIAL_FORMAT_KIM:
          error(1, "KIM format is not supported by EAM elstat force routine!");
      }
    }

    /* F */
    for (col = g_calc.paircol + g_param.ntypes;
         col < g_calc.paircol + 2 * g_param.ntypes; col++) {
      first = g_pot.calc_pot.first[col];
      /* gradient at left boundary matched to square root function,
         when 0 not in domain(F), else natural spline */
      switch (g_pot.format_type) {
        case POTENTIAL_FORMAT_UNKNOWN:
          error(1, "Unknown potential format detected! (%s:%d)\n", __FILE__,
                __LINE__);
        case POTENTIAL_FORMAT_ANALYTIC:
        case POTENTIAL_FORMAT_TABULATED_EQ_DIST: {
          spline_ed(g_pot.calc_pot.step[col], xi + first,
                    g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2),
                    *(xi + first - 1), g_pot.calc_pot.d2tab + first);
          break;
        }
        case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST: {
          spline_ne(g_pot.calc_pot.xcoord + first, xi + first,
                    g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2),
                    *(xi + first - 1), g_pot.calc_pot.d2tab + first);
          break;
        }
        case POTENTIAL_FORMAT_KIM:
          error(1, "KIM format is not supported by EAM elstat force routine!");
      }
    }

    /* region containing loop over configurations */
    {
      int self;
      vector tmp_force;
      int h, j, type1, type2, uf;
#if defined(STRESS)
      int us = 0;
      int stresses = 0;
#endif
      int n_i, n_j;
      double fnval, grad, fnval_tail, grad_tail, grad_i, grad_j;
#if defined(DIPOLE)
      double p_sr_tail = 0.0;
#endif
      atom_t* atom;
      neigh_t* neigh;
      double r;
      int col_F;
      double eam_force;
      double rho_val, rho_grad, rho_grad_j;

      /* loop over configurations: M A I N LOOP CONTAINING ALL ATOM-LOOPS */
      for (h = g_mpi.firstconf; h < g_mpi.firstconf + g_mpi.myconf; h++) {
        uf = g_config.conf_uf[h - g_mpi.firstconf];
#if defined(STRESS)
        us = g_config.conf_us[h - g_mpi.firstconf];
#endif  // STRESS
        /* reset energies and stresses */
        forces[g_calc.energy_p + h] = 0.0;
#if defined(STRESS)
        stresses = g_calc.stress_p + 6 * h;
        for (i = 0; i < 6; i++)
          forces[stresses + i] = 0.0;
#endif  // STRESS

        /* set limiting constraints */
        forces[g_calc.limit_p + h] = -g_config.force_0[g_calc.limit_p + h];

#if defined(DIPOLE)
        /* reset dipoles and fields: LOOP Z E R O */
        for (i = 0; i < g_config.inconf[h]; i++) {
          atom =
              g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom;
          atom->E_stat.x = 0.0;
          atom->E_stat.y = 0.0;
          atom->E_stat.z = 0.0;
          atom->p_sr.x = 0.0;
          atom->p_sr.y = 0.0;
          atom->p_sr.z = 0.0;
        }
#endif  // DIPOLE

        /* F I R S T LOOP OVER ATOMS: reset forces, dipoles */
        for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
          n_i = 3 * (g_config.cnfstart[h] + i);
          if (uf) {
            forces[n_i + 0] = -g_config.force_0[n_i + 0];
            forces[n_i + 1] = -g_config.force_0[n_i + 1];
            forces[n_i + 2] = -g_config.force_0[n_i + 2];
          } else {
            forces[n_i + 0] = 0.0;
            forces[n_i + 1] = 0.0;
            forces[n_i + 2] = 0.0;
          }
          /* reset atomic density */
          g_config.conf_atoms[g_config.cnfstart[h] - g_mpi.firstatom + i].rho =
              0.0;
        } /* end F I R S T LOOP */

        /* S E C O N D loop: calculate short-range and monopole forces,
           calculate static field- and dipole-contributions,
           calculate atomic densities */
        for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
          atom =
              g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom;
          type1 = atom->type;
          n_i = 3 * (g_config.cnfstart[h] + i);
          for (j = 0; j < atom->num_neigh; j++) { /* neighbors */
            neigh = atom->neigh + j;
            type2 = neigh->type;
            col = neigh->col[0];

            /* updating tail-functions - only necessary with varying kappa */
            if (!apt->sw_kappa)
#if defined(DSF)
              elstat_dsf(neigh->r, dp_kappa, &neigh->fnval_el,
                           &neigh->grad_el, &neigh->ggrad_el);
#else
              elstat_shift(neigh->r, dp_kappa, &neigh->fnval_el,
                           &neigh->grad_el, &neigh->ggrad_el);
#endif // DSF

            /* In small cells, an atom might interact with itself */
            self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0;

            /* calculate short-range forces */
            if (neigh->r < g_pot.calc_pot.end[col]) {
              if (uf) {
                fnval = splint_comb_dir(&g_pot.calc_pot, xi, neigh->slot[0],
                                        neigh->shift[0], neigh->step[0], &grad);
              } else {
                fnval = splint_dir(&g_pot.calc_pot, xi, neigh->slot[0],
                                   neigh->shift[0], neigh->step[0]);
              }

              /* avoid double counting if atom is interacting with a copy of
               * itself */
              if (self) {
                fnval *= 0.5;
                grad *= 0.5;
              }
              forces[g_calc.energy_p + h] += fnval;

              if (uf) {
                tmp_force.x = neigh->dist_r.x * grad;
                tmp_force.y = neigh->dist_r.y * grad;
                tmp_force.z = neigh->dist_r.z * grad;
                forces[n_i + 0] += tmp_force.x;
                forces[n_i + 1] += tmp_force.y;
                forces[n_i + 2] += tmp_force.z;
                /* actio = reactio */
                n_j = 3 * neigh->nr;
                forces[n_j + 0] -= tmp_force.x;
                forces[n_j + 1] -= tmp_force.y;
                forces[n_j + 2] -= tmp_force.z;
#if defined(STRESS)
                /* calculate pair stresses */
                if (us) {
                  forces[stresses + 0] -= neigh->dist.x * tmp_force.x;
                  forces[stresses + 1] -= neigh->dist.y * tmp_force.y;
                  forces[stresses + 2] -= neigh->dist.z * tmp_force.z;
                  forces[stresses + 3] -= neigh->dist.x * tmp_force.y;
                  forces[stresses + 4] -= neigh->dist.y * tmp_force.z;
                  forces[stresses + 5] -= neigh->dist.z * tmp_force.x;
                }
#endif  // STRESS
              }
            }

            /* calculate monopole forces */
            if (neigh->r < g_config.dp_cut &&
                (charge[type1] || charge[type2])) {
              fnval_tail = neigh->fnval_el;
              grad_tail = neigh->grad_el;

              grad_i = charge[type2] * grad_tail;
              if (type1 == type2) {
                grad_j = grad_i;
              } else {
                grad_j = charge[type1] * grad_tail;
              }
              fnval = charge[type1] * charge[type2] * fnval_tail;
              grad = charge[type1] * grad_i;

              if (self) {
                grad_i *= 0.5;
                grad_j *= 0.5;
                fnval *= 0.5;
                grad *= 0.5;
              }

              forces[g_calc.energy_p + h] += fnval;

              if (uf) {
                tmp_force.x = neigh->dist.x * grad;
                tmp_force.y = neigh->dist.y * grad;
                tmp_force.z = neigh->dist.z * grad;
                forces[n_i + 0] += tmp_force.x;
                forces[n_i + 1] += tmp_force.y;
                forces[n_i + 2] += tmp_force.z;
                /* actio = reactio */
                n_j = 3 * neigh->nr;
                forces[n_j + 0] -= tmp_force.x;
                forces[n_j + 1] -= tmp_force.y;
                forces[n_j + 2] -= tmp_force.z;
#if defined(STRESS)
                /* calculate coulomb stresses */
                if (us) {
                  forces[stresses + 0] -= neigh->dist.x * tmp_force.x;
                  forces[stresses + 1] -= neigh->dist.y * tmp_force.y;
                  forces[stresses + 2] -= neigh->dist.z * tmp_force.z;
                  forces[stresses + 3] -= neigh->dist.x * tmp_force.y;
                  forces[stresses + 4] -= neigh->dist.y * tmp_force.z;
                  forces[stresses + 5] -= neigh->dist.z * tmp_force.x;
                }
#endif  // STRESS
              }
#if defined(DIPOLE)
              /* calculate static field-contributions */
              atom->E_stat.x += neigh->dist.x * grad_i;
              atom->E_stat.y += neigh->dist.y * grad_i;
              atom->E_stat.z += neigh->dist.z * grad_i;

              g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.x -=
                  neigh->dist.x * grad_j;
              g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.y -=
                  neigh->dist.y * grad_j;
              g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.z -=
                  neigh->dist.z * grad_j;

              /* calculate short-range dipoles */
              if (dp_alpha[type1] && dp_b[col] && dp_c[col]) {
                p_sr_tail = grad_tail * neigh->r *
                            shortrange_value(neigh->r, dp_alpha[type1],
                                             dp_b[col], dp_c[col]);
                atom->p_sr.x += charge[type2] * neigh->dist_r.x * p_sr_tail;
                atom->p_sr.y += charge[type2] * neigh->dist_r.y * p_sr_tail;
                atom->p_sr.z += charge[type2] * neigh->dist_r.z * p_sr_tail;
              }
              if (dp_alpha[type2] && dp_b[col] && dp_c[col] && !self) {
                p_sr_tail = grad_tail * neigh->r *
                            shortrange_value(neigh->r, dp_alpha[type2],
                                             dp_b[col], dp_c[col]);
                g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.x -=
                    charge[type1] * neigh->dist_r.x * p_sr_tail;
                g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.y -=
                    charge[type1] * neigh->dist_r.y * p_sr_tail;
                g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.z -=
                    charge[type1] * neigh->dist_r.z * p_sr_tail;
              }
#endif  // DIPOLE
            }

            /* calculate atomic densities */
            if (atom->type == neigh->type) {
              /* then transfer(a->b)==transfer(b->a) */
              if (neigh->r < g_pot.calc_pot.end[neigh->col[1]]) {
                rho_val = splint_dir(&g_pot.calc_pot, xi, neigh->slot[1],
                                     neigh->shift[1], neigh->step[1]);
                atom->rho += rho_val;
                /* avoid double counting if atom is interacting with a
                   copy of itself */
                if (!self) {
                  g_config.conf_atoms[neigh->nr - g_mpi.firstatom].rho +=
                      rho_val;
                }
              }
            } else {
              /* transfer(a->b)!=transfer(b->a) */
              if (neigh->r < g_pot.calc_pot.end[neigh->col[1]]) {
                atom->rho += splint_dir(&g_pot.calc_pot, xi, neigh->slot[1],
                                        neigh->shift[1], neigh->step[1]);
              }
              /* cannot use slot/shift to access splines */
              if (neigh->r < g_pot.calc_pot.end[g_calc.paircol + atom->type])
                g_config.conf_atoms[neigh->nr - g_mpi.firstatom].rho +=
                    (*g_splint)(&g_pot.calc_pot, xi,
                                g_calc.paircol + atom->type, neigh->r);
            }

          } /* loop over neighbours */

          col_F =
              g_calc.paircol + g_param.ntypes + atom->type; /* column of F */
          if (atom->rho > g_pot.calc_pot.end[col_F]) {
            /* then punish target function -> bad potential */
            forces[g_calc.limit_p + h] +=
                DUMMY_WEIGHT * 10.0 *
                dsquare(atom->rho - g_pot.calc_pot.end[col_F]);
            atom->rho = g_pot.calc_pot.end[col_F];
          }

          if (atom->rho < g_pot.calc_pot.begin[col_F]) {
            /* then punish target function -> bad potential */
            forces[g_calc.limit_p + h] +=
                DUMMY_WEIGHT * 10.0 *
                dsquare(g_pot.calc_pot.begin[col_F] - atom->rho);
            atom->rho = g_pot.calc_pot.begin[col_F];
          }

/* embedding energy, embedding gradient */
/* contribution to cohesive energy is F(n) */
#if defined(NORESCALE)
          if (atom->rho < g_pot.calc_pot.begin[col_F]) {
            /* linear extrapolation left */
            rho_val = splint_comb(&calc_pot, xi, col_F,
                                  g_pot.calc_pot.begin[col_F], &atom->gradF);
            forces[energy_p + h] +=
                rho_val +
                (atom->rho - g_pot.calc_pot.begin[col_F]) * atom->gradF;
#if defined(APOT)
            forces[limit_p + h] += DUMMY_WEIGHT * 10.0 *
                                   dsquare(calc_pot.begin[col_F] - atom->rho);
#endif  // APOT
          } else if (atom->rho > g_pot.calc_pot.end[col_F]) {
            /* and right */
            rho_val = splint_comb(
                &calc_pot, xi, col_F,
                g_pot.calc_pot.end[col_F] - 0.5 * g_pot.calc_pot.step[col_F],
                &atom->gradF);
            forces[energy_p + h] +=
                rho_val + (atom->rho - g_pot.calc_pot.end[col_F]) * atom->gradF;
#if defined(APOT)
            forces[limit_p + h] +=
                DUMMY_WEIGHT * 10.0 *
                dsquare(atom->rho - g_pot.calc_pot.end[col_F]);
#endif  // APOT
          }
          /* and in-between */
          else {
            forces[energy_p + h] +=
                splint_comb(&calc_pot, xi, col_F, atom->rho, &atom->gradF);
          }
#else
          forces[g_calc.energy_p + h] += (*g_splint_comb)(
              &g_pot.calc_pot, xi, col_F, atom->rho, &atom->gradF);
#endif  // NORESCALE
          /* sum up rho */
          rho_sum_loc += atom->rho;

        } /* end S E C O N D loop over atoms */

#if defined(DIPOLE)
        /* T H I R D loop: calculate whole dipole moment for every atom */
        double rp, dp_sum;
        int dp_converged = 0, dp_it = 0;
        double max_diff = 10;

        while (dp_converged == 0) {
          dp_sum = 0;
          for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
            atom = g_config.conf_atoms + i + g_config.cnfstart[h] -
                   g_mpi.firstatom;
            type1 = atom->type;
            if (dp_alpha[type1]) {
              if (dp_it) {
                /* note: the mixing parameter differs from the one in IMD */
                atom->E_tot.x = (1 - g_config.dp_mix) * atom->E_ind.x +
                                g_config.dp_mix * atom->E_old.x +
                                atom->E_stat.x;
                atom->E_tot.y = (1 - g_config.dp_mix) * atom->E_ind.y +
                                g_config.dp_mix * atom->E_old.y +
                                atom->E_stat.y;
                atom->E_tot.z = (1 - g_config.dp_mix) * atom->E_ind.z +
                                g_config.dp_mix * atom->E_old.z +
                                atom->E_stat.z;
              } else {
                atom->E_tot.x = atom->E_ind.x + atom->E_stat.x;
                atom->E_tot.y = atom->E_ind.y + atom->E_stat.y;
                atom->E_tot.z = atom->E_ind.z + atom->E_stat.z;
              }

              atom->p_ind.x = dp_alpha[type1] * atom->E_tot.x + atom->p_sr.x;
              atom->p_ind.y = dp_alpha[type1] * atom->E_tot.y + atom->p_sr.y;
              atom->p_ind.z = dp_alpha[type1] * atom->E_tot.z + atom->p_sr.z;

              atom->E_old.x = atom->E_ind.x;
              atom->E_old.y = atom->E_ind.y;
              atom->E_old.z = atom->E_ind.z;

              atom->E_ind.x = 0.0;
              atom->E_ind.y = 0.0;
              atom->E_ind.z = 0.0;
            }
          }

          for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
            atom = g_config.conf_atoms + i + g_config.cnfstart[h] -
                   g_mpi.firstatom;
            type1 = atom->type;
            for (j = 0; j < atom->num_neigh; j++) { /* neighbors */
              neigh = atom->neigh + j;
              type2 = neigh->type;
              col = neigh->col[0];
              /* In small cells, an atom might interact with itself */
              self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0;

              if (neigh->r < g_config.dp_cut && dp_alpha[type1] &&
                  dp_alpha[type2]) {
                rp = SPROD(
                    g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind,
                    neigh->dist_r);
                atom->E_ind.x +=
                    neigh->grad_el *
                    (3 * rp * neigh->dist_r.x -
                     g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.x);
                atom->E_ind.y +=
                    neigh->grad_el *
                    (3 * rp * neigh->dist_r.y -
                     g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.y);
                atom->E_ind.z +=
                    neigh->grad_el *
                    (3 * rp * neigh->dist_r.z -
                     g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.z);

                if (!self) {
                  rp = SPROD(atom->p_ind, neigh->dist_r);
                  g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.x +=
                      neigh->grad_el *
                      (3 * rp * neigh->dist_r.x - atom->p_ind.x);
                  g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.y +=
                      neigh->grad_el *
                      (3 * rp * neigh->dist_r.y - atom->p_ind.y);
                  g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.z +=
                      neigh->grad_el *
                      (3 * rp * neigh->dist_r.z - atom->p_ind.z);
                }
              }
            }
          }

          for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
            atom = g_config.conf_atoms + i + g_config.cnfstart[h] -
                   g_mpi.firstatom;
            type1 = atom->type;
            if (dp_alpha[type1]) {
              dp_sum +=
                  dsquare(dp_alpha[type1] * (atom->E_old.x - atom->E_ind.x));
              dp_sum +=
                  dsquare(dp_alpha[type1] * (atom->E_old.y - atom->E_ind.y));
              dp_sum +=
                  dsquare(dp_alpha[type1] * (atom->E_old.z - atom->E_ind.z));
            }
          }

          dp_sum /= 3 * g_config.inconf[h];
          dp_sum = sqrt(dp_sum);

          if (dp_it) {
            if ((dp_sum > max_diff) || (dp_it > 50)) {
              dp_converged = 1;
              for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
                atom = g_config.conf_atoms + i + g_config.cnfstart[h] -
                       g_mpi.firstatom;
                type1 = atom->type;
                if (dp_alpha[type1]) {
                  atom->p_ind.x =
                      dp_alpha[type1] * atom->E_stat.x + atom->p_sr.x;
                  atom->p_ind.y =
                      dp_alpha[type1] * atom->E_stat.y + atom->p_sr.y;
                  atom->p_ind.z =
                      dp_alpha[type1] * atom->E_stat.z + atom->p_sr.z;
                  atom->E_ind.x = atom->E_stat.x;
                  atom->E_ind.y = atom->E_stat.y;
                  atom->E_ind.z = atom->E_stat.z;
                }
              }
            }
          }

          if (dp_sum < g_config.dp_tol)
            dp_converged = 1;

          dp_it++;
        } /* end T H I R D loop over atoms */

        /* F O U R T H  loop: calculate monopole-dipole and dipole-dipole forces
         */
        double rp_i, rp_j, pp_ij, tmp_1, tmp_2;
        double grad_1, grad_2, srval, srgrad, srval_tail, srgrad_tail,
            fnval_sum, grad_sum;

        for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
          atom =
              g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom;
          type1 = atom->type;
          n_i = 3 * (g_config.cnfstart[h] + i);
          for (j = 0; j < atom->num_neigh; j++) { /* neighbors */
            neigh = atom->neigh + j;
            type2 = neigh->type;
            col = neigh->col[0];

            /* In small cells, an atom might interact with itself */
            self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0;
            if (neigh->r < g_config.dp_cut &&
                (dp_alpha[type1] || dp_alpha[type2])) {
              fnval_tail = -neigh->grad_el;
              grad_tail = -neigh->ggrad_el;

              if (dp_b[col] && dp_c[col]) {
                shortrange_term(neigh->r, dp_b[col], dp_c[col], &srval_tail,
                                &srgrad_tail);
                srval = fnval_tail * srval_tail;
                srgrad = fnval_tail * srgrad_tail + grad_tail * srval_tail;
              }

              if (self) {
                fnval_tail *= 0.5;
                grad_tail *= 0.5;
              }

              /* monopole-dipole contributions */
              if (charge[type1] && dp_alpha[type2]) {
                if (dp_b[col] && dp_c[col]) {
                  fnval_sum = fnval_tail + srval;
                  grad_sum = grad_tail + srgrad;
                } else {
                  fnval_sum = fnval_tail;
                  grad_sum = grad_tail;
                }

                rp_j = SPROD(
                    g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind,
                    neigh->dist_r);
                fnval = charge[type1] * rp_j * fnval_sum * neigh->r;
                grad_1 = charge[type1] * rp_j * grad_sum * neigh->r2;
                grad_2 = charge[type1] * fnval_sum;

                forces[g_calc.energy_p + h] -= fnval;

                if (uf) {
                  tmp_force.x =
                      neigh->dist_r.x * grad_1 +
                      g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.x *
                          grad_2;
                  tmp_force.y =
                      neigh->dist_r.y * grad_1 +
                      g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.y *
                          grad_2;
                  tmp_force.z =
                      neigh->dist_r.z * grad_1 +
                      g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.z *
                          grad_2;
                  forces[n_i + 0] -= tmp_force.x;
                  forces[n_i + 1] -= tmp_force.y;
                  forces[n_i + 2] -= tmp_force.z;
                  /* actio = reactio */
                  n_j = 3 * neigh->nr;
                  forces[n_j + 0] += tmp_force.x;
                  forces[n_j + 1] += tmp_force.y;
                  forces[n_j + 2] += tmp_force.z;

#if defined(STRESS)
                  /* calculate stresses */
                  if (us) {
                    forces[stresses + 0] += neigh->dist.x * tmp_force.x;
                    forces[stresses + 1] += neigh->dist.y * tmp_force.y;
                    forces[stresses + 2] += neigh->dist.z * tmp_force.z;
                    forces[stresses + 3] += neigh->dist.x * tmp_force.y;
                    forces[stresses + 4] += neigh->dist.y * tmp_force.z;
                    forces[stresses + 5] += neigh->dist.z * tmp_force.x;
                  }
#endif  // STRESS
                }
              }

              /* dipole-monopole contributions */
              if (dp_alpha[type2] && charge[type2]) {
                if (dp_b[col] && dp_c[col]) {
                  fnval_sum = fnval_tail + srval;
                  grad_sum = grad_tail + srgrad;
                } else {
                  fnval_sum = fnval_tail;
                  grad_sum = grad_tail;
                }

                rp_i = SPROD(atom->p_ind, neigh->dist_r);
                fnval = charge[type2] * rp_i * fnval_sum * neigh->r;
                grad_1 = charge[type2] * rp_i * grad_sum * neigh->r2;
                grad_2 = charge[type2] * fnval_sum;

                forces[g_calc.energy_p + h] += fnval;

                if (uf) {
                  tmp_force.x =
                      neigh->dist_r.x * grad_1 + atom->p_ind.x * grad_2;
                  tmp_force.y =
                      neigh->dist_r.y * grad_1 + atom->p_ind.y * grad_2;
                  tmp_force.z =
                      neigh->dist_r.z * grad_1 + atom->p_ind.z * grad_2;
                  forces[n_i + 0] += tmp_force.x;
                  forces[n_i + 1] += tmp_force.y;
                  forces[n_i + 2] += tmp_force.z;
                  /* actio = reactio */
                  n_j = 3 * neigh->nr;
                  forces[n_j + 0] -= tmp_force.x;
                  forces[n_j + 1] -= tmp_force.y;
                  forces[n_j + 2] -= tmp_force.z;

#if defined(STRESS)
                  /* calculate stresses */
                  if (us) {
                    forces[stresses + 0] -= neigh->dist.x * tmp_force.x;
                    forces[stresses + 1] -= neigh->dist.y * tmp_force.y;
                    forces[stresses + 2] -= neigh->dist.z * tmp_force.z;
                    forces[stresses + 3] -= neigh->dist.x * tmp_force.y;
                    forces[stresses + 4] -= neigh->dist.y * tmp_force.z;
                    forces[stresses + 5] -= neigh->dist.z * tmp_force.x;
                  }
#endif  // STRESS
                }
              }

              /* dipole-dipole contributions */
              if (dp_alpha[type1] && dp_alpha[type2]) {
                pp_ij = SPROD(
                    atom->p_ind,
                    g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind);
                tmp_1 = 3 * rp_i * rp_j;
                tmp_2 = 3 * fnval_tail / neigh->r2;

                fnval = -(tmp_1 - pp_ij) * fnval_tail;
                grad_1 = (tmp_1 - pp_ij) * grad_tail;
                grad_2 = 2 * rp_i * rp_j;

                forces[g_calc.energy_p + h] += fnval;

                if (uf) {
                  tmp_force.x =
                      grad_1 * neigh->dist.x -
                      tmp_2 *
                          (grad_2 * neigh->dist.x -
                           rp_i * neigh->r *
                               g_config.conf_atoms[neigh->nr - g_mpi.firstatom]
                                   .p_ind.x -
                           rp_j * neigh->r * atom->p_ind.x);
                  tmp_force.y =
                      grad_1 * neigh->dist.y -
                      tmp_2 *
                          (grad_2 * neigh->dist.y -
                           rp_i * neigh->r *
                               g_config.conf_atoms[neigh->nr - g_mpi.firstatom]
                                   .p_ind.y -
                           rp_j * neigh->r * atom->p_ind.y);
                  tmp_force.z =
                      grad_1 * neigh->dist.z -
                      tmp_2 *
                          (grad_2 * neigh->dist.z -
                           rp_i * neigh->r *
                               g_config.conf_atoms[neigh->nr - g_mpi.firstatom]
                                   .p_ind.z -
                           rp_j * neigh->r * atom->p_ind.z);
                  forces[n_i + 0] -= tmp_force.x;
                  forces[n_i + 1] -= tmp_force.y;
                  forces[n_i + 2] -= tmp_force.z;
                  /* actio = reactio */
                  n_j = 3 * neigh->nr;
                  forces[n_j + 0] += tmp_force.x;
                  forces[n_j + 1] += tmp_force.y;
                  forces[n_j + 2] += tmp_force.z;

#if defined(STRESS)
                  /* calculate stresses */
                  if (us) {
                    forces[stresses + 0] += neigh->dist.x * tmp_force.x;
                    forces[stresses + 1] += neigh->dist.y * tmp_force.y;
                    forces[stresses + 2] += neigh->dist.z * tmp_force.z;
                    forces[stresses + 3] += neigh->dist.x * tmp_force.y;
                    forces[stresses + 4] += neigh->dist.y * tmp_force.z;
                    forces[stresses + 5] += neigh->dist.z * tmp_force.x;
                  }
#endif  // STRESS
                }
              }
            }
          } /* loop over neighbours */
        }   /* end F O U R T H loop over atoms */
#endif      // DIPOLE

        /* F I F T H  loop: self energy contributions and sum-up force
         * contributions */
        double qq;
#if defined(DSF)
        double fnval_cut, gtail_cut, ggrad_cut;
        elstat_value(g_config.dp_cut, dp_kappa, &fnval_cut, &gtail_cut, &ggrad_cut);
#endif  // DSF
        for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */
          atom =
              g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom;
          type1 = atom->type;
          n_i = 3 * (g_config.cnfstart[h] + i);

          /* self energy contributions */
          if (charge[type1]) {
            qq = charge[type1] * charge[type1];
#if defined(DSF)
            fnval = qq * (DP_EPS * dp_kappa / sqrt(M_PI) +
                          (fnval_cut - gtail_cut * g_config.dp_cut *
                                           g_config.dp_cut) * 0.5);
#else
            fnval = DP_EPS * dp_kappa * qq / sqrt(M_PI);
#endif  // DSF
            forces[g_calc.energy_p + h] -= fnval;
          }
#if defined(DIPOLE)
          double pp;
          if (dp_alpha[type1]) {
            pp = SPROD(atom->p_ind, atom->p_ind);
            fnval = pp / (2 * dp_alpha[type1]);
            forces[g_calc.energy_p + h] += fnval;
          }
/* alternative dipole self energy including kappa-dependence */
// if (dp_alpha[type1]) {
// pp = SPROD(atom->p_ind, atom->p_ind);
// fnval = kkk * pp / sqrt(M_PI);
// forces[energy_p + h] += fnval;
//}
#endif  // DIPOLE

          /* sum-up: whole force contributions flow into tmpsum */
          /*          if (uf) {*/
          /*#ifdef FWEIGHT*/
          /*             Weigh by absolute value of force */
          /*            forces[k] /= FORCE_EPS + atom->absforce;*/
          /*            forces[k + 1] /= FORCE_EPS + atom->absforce;*/
          /*            forces[k + 2] /= FORCE_EPS + atom->absforce;*/
          /*#endif |+ FWEIGHT +|*/
          /*#ifdef CONTRIB*/
          /*            if (atom->contrib)*/
          /*#endif |+ CONTRIB +|*/
          /*              tmpsum +=*/
          /*                conf_weight[h] * (dsquare(forces[k]) +
           * dsquare(forces[k + 1]) + dsquare(forces[k + 2]));*/
          /*            printf("tmpsum = %f (forces)\n",tmpsum);*/
          /*          }*/

        } /* end F I F T H loop over atoms */

        /* S I X T H  loop: EAM force */
        if (uf) { /* only required if we calc forces */
          for (i = 0; i < g_config.inconf[h]; i++) {
            atom = g_config.conf_atoms + i + g_config.cnfstart[h] -
                   g_mpi.firstatom;
            n_i = 3 * (g_config.cnfstart[h] + i);
            for (j = 0; j < atom->num_neigh; j++) {
              /* loop over neighbors */
              neigh = atom->neigh + j;
              /* In small cells, an atom might interact with itself */
              self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0;
              col_F = g_calc.paircol + g_param.ntypes +
                      atom->type; /* column of F */
              r = neigh->r;
              /* are we within reach? */
              if ((r < g_pot.calc_pot.end[neigh->col[1]]) ||
                  (r < g_pot.calc_pot.end[col_F - g_param.ntypes])) {
                rho_grad =
                    (r < g_pot.calc_pot.end[neigh->col[1]])
                        ? splint_grad_dir(&g_pot.calc_pot, xi, neigh->slot[1],
                                          neigh->shift[1], neigh->step[1])
                        : 0.0;
                if (atom->type == neigh->type) /* use actio = reactio */
                  rho_grad_j = rho_grad;
                else
                  rho_grad_j = (r < g_pot.calc_pot.end[col_F - g_param.ntypes])
                                   ? (*g_splint_grad)(&g_pot.calc_pot, xi,
                                                      col_F - g_param.ntypes, r)
                                   : 0.0;
                /* now we know everything - calculate forces */
                eam_force =
                    (rho_grad * atom->gradF +
                     rho_grad_j *
                         g_config.conf_atoms[(neigh->nr) - g_mpi.firstatom]
                             .gradF);
                /* avoid double counting if atom is interacting with a
                   copy of itself */
                if (self)
                  eam_force *= 0.5;
                tmp_force.x = neigh->dist_r.x * eam_force;
                tmp_force.y = neigh->dist_r.y * eam_force;
                tmp_force.z = neigh->dist_r.z * eam_force;
                forces[n_i + 0] += tmp_force.x;
                forces[n_i + 1] += tmp_force.y;
                forces[n_i + 2] += tmp_force.z;
                /* actio = reactio */
                n_j = 3 * neigh->nr;
                forces[n_j + 0] -= tmp_force.x;
                forces[n_j + 1] -= tmp_force.y;
                forces[n_j + 2] -= tmp_force.z;
#if defined(STRESS)
                /* and stresses */
                if (us) {
                  forces[stresses + 0] -= neigh->dist.x * tmp_force.x;
                  forces[stresses + 1] -= neigh->dist.y * tmp_force.y;
                  forces[stresses + 2] -= neigh->dist.z * tmp_force.z;
                  forces[stresses + 3] -= neigh->dist.x * tmp_force.y;
                  forces[stresses + 4] -= neigh->dist.y * tmp_force.z;
                  forces[stresses + 5] -= neigh->dist.z * tmp_force.x;
                }
#endif          // STRESS
              } /* within reach */
            }   /* loop over neighbours */
#if defined(FWEIGHT)
            /* Weigh by absolute value of force */
            forces[n_i + 0] /= FORCE_EPS + atom->absforce;
            forces[n_i + 1] /= FORCE_EPS + atom->absforce;
            forces[n_i + 2] /= FORCE_EPS + atom->absforce;
#endif  // FWEIGHT
            /* sum up forces */
#if defined(CONTRIB)
            if (atom->contrib)
#endif  // CONTRIB
              tmpsum += g_config.conf_weight[h] *
                        (dsquare(forces[n_i + 0]) + dsquare(forces[n_i + 1]) +
                         dsquare(forces[n_i + 2]));
          }
        }

        /* end S I X T H loop over atoms */
        /* whole energy contributions flow into tmpsum */
        forces[g_calc.energy_p + h] /= (double)g_config.inconf[h];
        forces[g_calc.energy_p + h] -= g_config.force_0[g_calc.energy_p + h];
        tmpsum += g_config.conf_weight[h] * g_param.eweight *
                  dsquare(forces[g_calc.energy_p + h]);

#if defined(STRESS)
        /* whole stress contributions flow into tmpsum */
        if (uf && us) {
          for (i = 0; i < 6; i++) {
            forces[stresses + i] /= g_config.conf_vol[h - g_mpi.firstconf];
            forces[stresses + i] -= g_config.force_0[stresses + i];
            tmpsum += g_config.conf_weight[h] * g_param.sweight *
                      dsquare(forces[stresses + i]);
          }
        }
#endif  // STRESS
        /* limiting constraints per configuration */
        tmpsum += g_config.conf_weight[h] * dsquare(forces[g_calc.limit_p + h]);
      } /* end M A I N loop over configurations */
    }   /* parallel region */
#if defined(MPI)
    /* Reduce rho_sum */
    MPI_Reduce(&rho_sum_loc, &rho_sum, 1, MPI_DOUBLE, MPI_SUM, 0,
               MPI_COMM_WORLD);
#else   /* MPI */
    rho_sum = rho_sum_loc;
#endif  // MPI

/* dummy constraints (global) */
#if defined(APOT)
    /* add punishment for out of bounds (mostly for powell_lsq) */
    if (g_mpi.myid == 0) {
      tmpsum += apot_punish(xi_opt, forces);
    }
#endif  // APOT

#if !defined(NOPUNISH)
    if (g_mpi.myid == 0) {
      int g;
      for (g = 0; g < g_param.ntypes; g++) {
#if defined(NORESCALE)
        /* clear field */
        forces[g_calc.dummy_p + g_param.ntypes + g] = 0.0; /* Free end... */
        /* NEW: Constraint on U': U'(1.0)=0.0; */
        forces[g_calc.dummy_p + g] =
            DUMMY_WEIGHT *
            splint_grad(&calc_pot, xi, paircol + g_param.ntypes + g, 1.0);
#else   /* NOTHING */
        forces[g_calc.dummy_p + g_param.ntypes + g] = 0.0; /* Free end... */
        /* constraints on U`(n) */
        forces[g_calc.dummy_p + g] =
            DUMMY_WEIGHT *
                (*g_splint_grad)(
                    &g_pot.calc_pot, xi, g_calc.paircol + g_param.ntypes + g,
                    0.5 * (g_pot.calc_pot
                               .begin[g_calc.paircol + g_param.ntypes + g] +
                           g_pot.calc_pot
                               .end[g_calc.paircol + g_param.ntypes + g])) -
            g_config.force_0[g_calc.dummy_p + g];
#endif  // NORESCALE
        tmpsum += dsquare(forces[g_calc.dummy_p + g_param.ntypes + g]);
        tmpsum += dsquare(forces[g_calc.dummy_p + g]);
      } /* loop over types */
#if defined(NORESCALE)
      /* NEW: Constraint on n: <n>=1.0 ONE CONSTRAINT ONLY */
      /* Calculate averages */
      rho_sum /= (double)natoms;
      /* ATTN: if there are invariant potentials, things might be problematic */
      forces[dummy_p + g_param.ntypes] = DUMMY_WEIGHT * (rho_sum - 1.0);
      tmpsum += dsquare(forces[dummy_p + g_param.ntypes]);
#endif  // NORESCALE
    }
#endif  // NOPUNISH

#if defined(MPI)
    /* reduce global sum */
    sum = 0.0;
    MPI_Reduce(&tmpsum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    /* gather forces, energies, stresses */
    if (g_mpi.myid == 0) { /* root node already has data in place */
      /* forces */
      MPI_Gatherv(MPI_IN_PLACE, g_mpi.myatoms, g_mpi.MPI_VECTOR, forces,
                  g_mpi.atom_len, g_mpi.atom_dist, g_mpi.MPI_VECTOR, 0,
                  MPI_COMM_WORLD);
      /* energies */
      MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, MPI_DOUBLE,
                  forces + g_calc.energy_p, g_mpi.conf_len, g_mpi.conf_dist,
                  MPI_DOUBLE, 0, MPI_COMM_WORLD);
#if defined(STRESS)
      /* stresses */
      MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, g_mpi.MPI_STENS,
                  forces + g_calc.stress_p, g_mpi.conf_len, g_mpi.conf_dist,
                  g_mpi.MPI_STENS, 0, MPI_COMM_WORLD);
#endif  // STRESS
#if !defined(NORESCALE)
      /* punishment constraints */
      MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, MPI_DOUBLE,
                  forces + g_calc.limit_p, g_mpi.conf_len, g_mpi.conf_dist,
                  MPI_DOUBLE, 0, MPI_COMM_WORLD);
#endif  // !NORESCALE
    } else {
      /* forces */
      MPI_Gatherv(forces + g_mpi.firstatom * 3, g_mpi.myatoms, g_mpi.MPI_VECTOR,
                  forces, g_mpi.atom_len, g_mpi.atom_dist, g_mpi.MPI_VECTOR, 0,
                  MPI_COMM_WORLD);
      /* energies */
      MPI_Gatherv(forces + g_calc.energy_p + g_mpi.firstconf, g_mpi.myconf,
                  MPI_DOUBLE, forces + g_calc.energy_p, g_mpi.conf_len,
                  g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#if defined(STRESS)
      /* stresses */
      MPI_Gatherv(forces + g_calc.stress_p + 6 * g_mpi.firstconf, g_mpi.myconf,
                  g_mpi.MPI_STENS, forces + g_calc.stress_p, g_mpi.conf_len,
                  g_mpi.conf_dist, g_mpi.MPI_STENS, 0, MPI_COMM_WORLD);
#endif  // STRESS
#if !defined(NORESCALE)
      /* punishment constraints */
      MPI_Gatherv(forces + g_calc.limit_p + g_mpi.firstconf, g_mpi.myconf,
                  MPI_DOUBLE, forces + g_calc.limit_p, g_mpi.conf_len,
                  g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#endif  // !NORESCALE
    }
/* no need to pick up dummy constraints - they are already @ root */
#else
    sum = tmpsum; /* global sum = local sum  */
#endif  // MPI

    /* root process exits this function now */
    if (g_mpi.myid == 0) {
      g_calc.fcalls++; /* Increase function call counter */
      if (isnan(sum)) {
#if defined(DEBUG)
        printf("\n--> Force is nan! <--\n\n");
#endif  // DEBUG
        return 10e10;
      } else
        return sum;
    }
  }

  /* once a non-root process arrives here, all is done. */
  return -1.0;
}
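A note on the closing pattern above: every rank accumulates its local
error sum (tmpsum), MPI_Reduce collects the global sum on root, the root
returns it, and non-root ranks return -1.0. A minimal sketch of just
that idiom, with illustrative names (collect_error_sum is not part of
the original code):

/* Minimal sketch of the reduce-and-return pattern, assuming the
 * local error sum has already been computed on every rank. */
#include <math.h>
#include <mpi.h>

double collect_error_sum(double local_error_sum)
{
  int myid;
  double sum = 0.0;

  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  /* every rank contributes; only root receives the global sum */
  MPI_Reduce(&local_error_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  if (myid == 0)
    return isnan(sum) ? 10e10 : sum; /* large penalty if nan */
  return -1.0; /* non-root ranks signal "no result here" */
}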
Example #18
0
void EXX_Fock_Band(
  dcomplex **H,
  EXX_t *exx,
  dcomplex ****exx_CDM, 
  int    *MP,
  double k1,  
  double k2,  
  double k3,
  int    spin
)
{
  double *local_H, *mpi_H;
  int H_n, nbuf;
  int i, j, n, ir, nr, irn, nep;

  int iop1, iop2, nb1, nb2, nb3, nb4, nrn;
  int ia1, ia2, ia3, ia4, ib1, ib2, ib3, ib4, icell1, icell2;
  int ib1_ep, ib2_ep, ib3_ep, ib4_ep;
  int nb1_ep, nb2_ep, nb3_ep, nb4_ep;
  const int *ep_atom1, *ep_atom2, *ep_cell, *atom_nb;
  const int *op_atom1, *op_atom2;
 
  int iep1[8], iep2[8], icell3[8], mul;
  double w;

  int GA_AN, Anum, GB_AN, Bnum, tnoA, tnoB;
  
  int     neri;
  double *eri_list;
  int *iRm;

  double eri;
  dcomplex den;
   
  int nproc, myid, iproc;
  MPI_Status stat;

  int ncd, nshell_ep;
  int iRn_x, iRn_y, iRn_z, iRmp_x, iRmp_y, iRmp_z;
  double kRn, kRmp, co1, si1, co2, si2;

  MPI_Comm comm;
  double *k1_list, *k2_list, *k3_list;
  int *spin_list;

  comm = g_exx_mpicomm;
  
  MPI_Comm_size(comm, &nproc);
  MPI_Comm_rank(comm, &myid);

  k1_list = (double*)malloc(sizeof(double)*nproc);
  k2_list = (double*)malloc(sizeof(double)*nproc);
  k3_list = (double*)malloc(sizeof(double)*nproc);
  spin_list = (int*)malloc(sizeof(int)*nproc);

  /* all-to-all */
  MPI_Allgather(&k1,   1, MPI_DOUBLE, k1_list,   1, MPI_DOUBLE, comm);
  MPI_Allgather(&k2,   1, MPI_DOUBLE, k2_list,   1, MPI_DOUBLE, comm);
  MPI_Allgather(&k3,   1, MPI_DOUBLE, k3_list,   1, MPI_DOUBLE, comm);
  MPI_Allgather(&spin, 1, MPI_INT,    spin_list, 1, MPI_INT,    comm);

  op_atom1 = EXX_Array_OP_Atom1(exx);
  op_atom2 = EXX_Array_OP_Atom2(exx);
  ep_atom1 = EXX_Array_EP_Atom1(exx);
  ep_atom2 = EXX_Array_EP_Atom2(exx);
  ep_cell  = EXX_Array_EP_Cell(exx);
  atom_nb  = EXX_atom_nb(exx);
  nshell_ep = EXX_Number_of_EP_Shells(exx);

  ncd = 2*nshell_ep+1;

  /* matrix size */
  H_n = 0; 
  for (i=1; i<=atomnum; i++){ H_n += Spe_Total_CNO[WhatSpecies[i]]; }
 
  /* allocation */
  nbuf = H_n*H_n*2;
  local_H = (double*)malloc(sizeof(double)*nbuf);
  mpi_H   = (double*)malloc(sizeof(double)*nbuf);

  nr = EXX_File_ERI_Read_NRecord(exx);

  /* clear buffer */
  for (i=0; i<nbuf; i++) { local_H[i] = 0.0; }

  for (ir=0; ir<nr; ir++) {
    EXX_File_ERI_Read_Data_Head(exx, ir, &iop1, &iop2,
      &nb1, &nb2, &nb3, &nb4, &nrn);

    neri = nb1*nb2*nb3*nb4*nrn; 
    eri_list = (double*)malloc(sizeof(double)*neri);
    iRm = (int*)malloc(sizeof(int)*nrn); /* Rm */
    EXX_File_ERI_Read(exx, ir, eri_list, iRm, iop1, iop2, nrn, neri);
 
    for (iproc=0; iproc<nproc; iproc++) {
      /* clear buffer */
      for (i=0; i<nbuf; i++) { mpi_H[i] = 0.0; } 

      k1   = k1_list[iproc];
      k2   = k2_list[iproc];
      k3   = k3_list[iproc];
      spin = spin_list[iproc];
      
      for (irn=0; irn<nrn; irn++) {
        EXX_OP2EP_Band(exx, iop1, iop2, iRm[irn], 
          &iep1[0], &iep2[0], &icell3[0], &mul);
        w = 1.0/(double)mul; 

        for (j=0; j<8; j++) {
          ia1 = ep_atom1[iep1[j]]; /* i */
          ia2 = ep_atom2[iep1[j]]; /* j */
          ia3 = ep_atom1[iep2[j]]; /* k */
          ia4 = ep_atom2[iep2[j]]; /* l */
          icell1 = ep_cell[iep1[j]]; /* Rn */
          icell2 = ep_cell[iep2[j]]; /* Rm' */
 
          nb1_ep = atom_nb[ia1];
          nb2_ep = atom_nb[ia2];
          nb3_ep = atom_nb[ia3];
          nb4_ep = atom_nb[ia4];
      
          GA_AN = ia1+1;
          Anum = MP[GA_AN];
        
          GB_AN = ia2+1;
          Bnum = MP[GB_AN];

          /* phase */
          iRn_x = icell1%ncd - nshell_ep;
          iRn_y = (icell1/ncd)%ncd - nshell_ep;
          iRn_z = (icell1/ncd/ncd)%ncd - nshell_ep;
          kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z;
          si1 = sin(2.0*PI*kRn);
          co1 = cos(2.0*PI*kRn);

          iRmp_x = icell2%ncd - nshell_ep;
          iRmp_y = (icell2/ncd)%ncd - nshell_ep;
          iRmp_z = (icell2/ncd/ncd)%ncd - nshell_ep;

          kRmp = k1*(double)iRmp_x + k2*(double)iRmp_y + k3*(double)iRmp_z;
          si2 = sin(2.0*PI*kRmp);
          co2 = cos(2.0*PI*kRmp);
 
          for (ib1_ep=0; ib1_ep<nb1_ep; ib1_ep++) {
            for (ib2_ep=0; ib2_ep<nb2_ep; ib2_ep++) {
              for (ib3_ep=0; ib3_ep<nb3_ep; ib3_ep++) {
                for (ib4_ep=0; ib4_ep<nb4_ep; ib4_ep++) {
                  EXX_Basis_Index(ib1_ep, ib2_ep, ib3_ep, ib4_ep,
                    &ib1, &ib2, &ib3, &ib4, j);
                  i = (((ib1*nb2+ib2)*nb3+ib3)*nb4+ib4)*nrn+irn;
                  eri = eri_list[i];
                  den.r = exx_CDM[spin][iep2[j]][ib3_ep][ib4_ep].r;
                  den.i = exx_CDM[spin][iep2[j]][ib3_ep][ib4_ep].i;
                  i = (Anum+ib1_ep-1)*H_n + (Bnum+ib2_ep-1);
                  mpi_H[2*i+0] += -w*eri*(co1*den.r - si1*den.i); /* real */
                  mpi_H[2*i+1] += -w*eri*(co1*den.i + si1*den.r); /* imag */
                }
              }
            }
          }
        } /* loop of j */
      } /* loop of irn */ 

      /* reduce to the iproc-th proc; MPI forbids aliasing the send and
         receive buffers, so the root must use MPI_IN_PLACE */
      if (myid==iproc) {
        MPI_Reduce(MPI_IN_PLACE, mpi_H, nbuf, MPI_DOUBLE, MPI_SUM,
                   iproc, comm);
        for (i=0; i<nbuf; i++) { local_H[i] = mpi_H[i]; }
      } else {
        MPI_Reduce(mpi_H, NULL, nbuf, MPI_DOUBLE, MPI_SUM, iproc, comm);
      }
    } /* loop of iproc */

    free(eri_list);
    free(iRm);
  } /* loop of ir */


#if 1
  for (GA_AN=1; GA_AN<=atomnum; GA_AN++) {
    tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
    Anum = MP[GA_AN];
    for (GB_AN=1; GB_AN<=atomnum; GB_AN++) {
      tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
      Bnum = MP[GB_AN];
      for (ib1=0; ib1<tnoA; ib1++) {
        for (ib2=0; ib2<tnoB; ib2++) {
          i = (Anum+ib1-1)*H_n + (Bnum+ib2-1);
          H[Anum+ib1][Bnum+ib2].r += local_H[2*i+0];
          H[Anum+ib1][Bnum+ib2].i += local_H[2*i+1];
        }
      }
    }
  }
#endif

  /* free */
  free(local_H); 
  free(mpi_H); 
}
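The reduction inside the iproc loop is worth isolating: MPI_Reduce with
identical send and receive pointers is erroneous, and MPI_IN_PLACE on
the root is the standard remedy. A minimal sketch of that idiom (the
function name reduce_in_place is illustrative):

/* Sum `buf` across all ranks of `comm` into `buf` on rank `root`,
 * without a second buffer. */
#include <mpi.h>

void reduce_in_place(double *buf, int n, int root, MPI_Comm comm)
{
  int myid;
  MPI_Comm_rank(comm, &myid);
  if (myid == root)
    MPI_Reduce(MPI_IN_PLACE, buf, n, MPI_DOUBLE, MPI_SUM, root, comm);
  else  /* receive buffer is ignored on non-root ranks */
    MPI_Reduce(buf, NULL, n, MPI_DOUBLE, MPI_SUM, root, comm);
}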
Example #19
0
void step3_gpu(int *n) {

  int nprocs, procid;
  MPI_Comm_rank(MPI_COMM_WORLD, &procid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  /* Create Cartesian Communicator */
  int c_dims[2]={0};
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);

  Complexf *data, *data_cpu;
  Complexf *data_hat;
  double f_time=0, i_time=0, setup_time=0;
  int alloc_max=0;

  int isize[3],osize[3],istart[3],ostart[3];
  /* Get the local pencil size and the allocation size */
  alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm);

#ifdef INPLACE
  data_cpu=(Complexf*)malloc(alloc_max);
  cudaMalloc((void**) &data, alloc_max);
#else
  data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float));
  cudaMalloc((void**) &data_hat, alloc_max);
#endif

  //accfft_init(nthreads);
  setup_time=-MPI_Wtime();

  /* Create FFT plan */
#ifdef INPLACE
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
  accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
  setup_time+=MPI_Wtime();

  /* Warmup Runs */
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif

  /*  Initialize data */
  initialize(data_cpu,n,c_comm);
#ifdef INPLACE
  cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);
#else
  cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice);
#endif

  MPI_Barrier(c_comm);


  /* Perform forward FFT */
  f_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
  f_time+=MPI_Wtime();

  MPI_Barrier(c_comm);

#ifndef INPLACE
  Complexf *data2_cpu, *data2;
  cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float));
  data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
#endif

  /* Perform backward FFT */
  i_time-=MPI_Wtime();
#ifdef INPLACE
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data);
#else
  accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
  i_time+=MPI_Wtime();

  /* copy back results on CPU and check error*/
#ifdef INPLACE
  cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
  check_err(data_cpu,n,c_comm);
#else
  cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost);
  check_err(data2_cpu,n,c_comm);
#endif


  /* Compute some timings statistics */
  double g_f_time, g_i_time, g_setup_time;
  MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
  MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);

#ifdef INPLACE
  PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#else
  PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl;
#endif
  PCOUT<<"Setup \t"<<g_setup_time<<std::endl;
  PCOUT<<"FFT \t"<<g_f_time<<std::endl;
  PCOUT<<"IFFT \t"<<g_i_time<<std::endl;

  MPI_Barrier(c_comm);
  cudaDeviceSynchronize();
  free(data_cpu);
  cudaFree(data);
#ifndef INPLACE
  cudaFree(data_hat);
  free(data2_cpu);
  cudaFree(data2);
#endif
  accfft_destroy_plan_gpu(plan);
  accfft_cleanup_gpuf();
  MPI_Comm_free(&c_comm);
  return ;

} // end step3_gpu
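The timing scheme in this example, subtract MPI_Wtime() at the start,
add it at the end, then reduce with MPI_MAX so the slowest rank defines
the reported time, is a reusable idiom. A minimal sketch (do_work is a
placeholder for the timed region):

#include <mpi.h>
#include <stdio.h>

void time_region(void (*do_work)(void))
{
  double t = -MPI_Wtime(); /* start: subtract the current time */
  do_work();
  t += MPI_Wtime();        /* stop: add the current time */

  double t_max = 0.0;
  MPI_Reduce(&t, &t_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  int myid;
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  if (myid == 0)
    printf("max time over ranks: %f s\n", t_max);
}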
Example #20
0
/*
 * This test looks at the handling of bitwise exclusive or (MPI_BXOR)
 * for types that are not integers or are not required integers
 * (e.g., long long).  MPICH allows these as well.  A strict MPI test
 * should not include this test.
 */
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rc;
    int rank, size;
    MPI_Comm      comm;
    char cinbuf[3], coutbuf[3];
    signed char scinbuf[3], scoutbuf[3];
    unsigned char ucinbuf[3], ucoutbuf[3];
    short sinbuf[3], soutbuf[3];
    unsigned short usinbuf[3], usoutbuf[3];
    long linbuf[3], loutbuf[3];
    unsigned long ulinbuf[3], uloutbuf[3];
    unsigned uinbuf[3], uoutbuf[3];
    int iinbuf[3], ioutbuf[3];
    

    MTest_Init( &argc, &argv );

    comm = MPI_COMM_WORLD;
    /* Set errors return so that we can provide better information 
       should a routine reject one of the operand/datatype pairs */
    MPI_Errhandler_set( comm, MPI_ERRORS_RETURN );

    MPI_Comm_rank( comm, &rank );
    MPI_Comm_size( comm, &size );

#ifndef USE_STRICT_MPI
    /* char */
    MTestPrintfMsg( 10, "Reduce of MPI_CHAR\n" );
    cinbuf[0] = 0xff;
    cinbuf[1] = 0;
    cinbuf[2] = (rank > 0) ? 0x3c : 0xc3;

    coutbuf[0] = 0xf;
    coutbuf[1] = 1;
    coutbuf[2] = 1;
    rc = MPI_Reduce( cinbuf, coutbuf, 3, MPI_CHAR, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_CHAR", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (coutbuf[0] != ((size % 2) ? (char)0xff : (char)0) ) {
		errs++;
		fprintf( stderr, "char BXOR(1) test failed\n" );
	    }
	    if (coutbuf[1]) {
		errs++;
		fprintf( stderr, "char BXOR(0) test failed\n" );
	    }
	    if (coutbuf[2] != ((size % 2) ? (char)0xc3 : (char)0xff)) {
		errs++;
		fprintf( stderr, "char BXOR(>) test failed\n" );
	    }
	}
    }
#endif /* USE_STRICT_MPI */

    /* signed char */
    MTestPrintfMsg( 10, "Reduce of MPI_SIGNED_CHAR\n" );
    scinbuf[0] = 0xff;
    scinbuf[1] = 0;
    scinbuf[2] = (rank > 0) ? 0x3c : 0xc3;

    scoutbuf[0] = 0xf;
    scoutbuf[1] = 1;
    scoutbuf[2] = 1;
    rc = MPI_Reduce( scinbuf, scoutbuf, 3, MPI_SIGNED_CHAR, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_SIGNED_CHAR", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (scoutbuf[0] != ((size % 2) ? (signed char)0xff : (signed char)0) ) {
		errs++;
		fprintf( stderr, "signed char BXOR(1) test failed\n" );
	    }
	    if (scoutbuf[1]) {
		errs++;
		fprintf( stderr, "signed char BXOR(0) test failed\n" );
	    }
	    if (scoutbuf[2] != ((size % 2) ? (signed char)0xc3 : (signed char)0xff)) {
		errs++;
		fprintf( stderr, "signed char BXOR(>) test failed\n" );
	    }
	}
    }

    /* unsigned char */
    MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_CHAR\n" );
    ucinbuf[0] = 0xff;
    ucinbuf[1] = 0;
    ucinbuf[2] = (rank > 0) ? 0x3c : 0xc3;

    ucoutbuf[0] = 0;
    ucoutbuf[1] = 1;
    ucoutbuf[2] = 1;
    rc = MPI_Reduce( ucinbuf, ucoutbuf, 3, MPI_UNSIGNED_CHAR, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_CHAR", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (ucoutbuf[0] != ((size % 2) ? 0xff : 0)) {
		errs++;
		fprintf( stderr, "unsigned char BXOR(1) test failed\n" );
	    }
	    if (ucoutbuf[1]) {
		errs++;
		fprintf( stderr, "unsigned char BXOR(0) test failed\n" );
	    }
	    if (ucoutbuf[2] != ((size % 2) ? (unsigned char)0xc3 : (unsigned char)0xff)) {
		errs++;
		fprintf( stderr, "unsigned char BXOR(>) test failed\n" );
	    }
	}
    }

    /* bytes */
    MTestPrintfMsg( 10, "Reduce of MPI_BYTE\n" );
    cinbuf[0] = 0xff;
    cinbuf[1] = 0;
    cinbuf[2] = (rank > 0) ? 0x3c : 0xc3;

    coutbuf[0] = 0;
    coutbuf[1] = 1;
    coutbuf[2] = 1;
    rc = MPI_Reduce( cinbuf, coutbuf, 3, MPI_BYTE, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_BYTE", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (coutbuf[0] != ((size % 2) ? (char)0xff : 0)) {
		errs++;
		fprintf( stderr, "byte BXOR(1) test failed\n" );
	    }
	    if (coutbuf[1]) {
		errs++;
		fprintf( stderr, "byte BXOR(0) test failed\n" );
	    }
	    if (coutbuf[2] != ((size % 2) ? (char)0xc3 : (char)0xff)) {
		errs++;
		fprintf( stderr, "byte BXOR(>) test failed\n" );
	    }
	}
    }

    /* short */
    MTestPrintfMsg( 10, "Reduce of MPI_SHORT\n" );
    sinbuf[0] = 0xffff;
    sinbuf[1] = 0;
    sinbuf[2] = (rank > 0) ? 0x3c3c : 0xc3c3;

    soutbuf[0] = 0;
    soutbuf[1] = 1;
    soutbuf[2] = 1;
    rc = MPI_Reduce( sinbuf, soutbuf, 3, MPI_SHORT, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_SHORT", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (soutbuf[0] != ((size % 2) ? (short)0xffff : 0)) {
		errs++;
		fprintf( stderr, "short BXOR(1) test failed\n" );
	    }
	    if (soutbuf[1]) {
		errs++;
		fprintf( stderr, "short BXOR(0) test failed\n" );
	    }
	    if (soutbuf[2] != ((size % 2) ? (short)0xc3c3 : (short)0xffff)) {
		errs++;
		fprintf( stderr, "short BXOR(>) test failed\n" );
	    }
	}
    }

    /* unsigned short */
    MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_SHORT\n" );
    usinbuf[0] = 0xffff;
    usinbuf[1] = 0;
    usinbuf[2] = (rank > 0) ? 0x3c3c : 0xc3c3;

    usoutbuf[0] = 0;
    usoutbuf[1] = 1;
    usoutbuf[2] = 1;
    rc = MPI_Reduce( usinbuf, usoutbuf, 3, MPI_UNSIGNED_SHORT, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_SHORT", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (usoutbuf[0] != ((size % 2) ? 0xffff : 0)) {
		errs++;
		fprintf( stderr, "short BXOR(1) test failed\n" );
	    }
	    if (usoutbuf[1]) {
		errs++;
		fprintf( stderr, "short BXOR(0) test failed\n" );
	    }
	    if (usoutbuf[2] != ((size % 2) ? 0xc3c3 : 0xffff)) {
		errs++;
		fprintf( stderr, "short BXOR(>) test failed\n" );
	    }
	}
    }

    /* unsigned */
    MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED\n" );
    uinbuf[0] = 0xffffffff;
    uinbuf[1] = 0;
    uinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3;

    uoutbuf[0] = 0;
    uoutbuf[1] = 1;
    uoutbuf[2] = 1;
    rc = MPI_Reduce( uinbuf, uoutbuf, 3, MPI_UNSIGNED, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (uoutbuf[0] != ((size % 2) ? 0xffffffff : 0)) {
		errs++;
		fprintf( stderr, "unsigned BXOR(1) test failed\n" );
	    }
	    if (uoutbuf[1]) {
		errs++;
		fprintf( stderr, "unsigned BXOR(0) test failed\n" );
	    }
	    if (uoutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) {
		errs++;
		fprintf( stderr, "unsigned BXOR(>) test failed\n" );
	    }
	}
    }

    /* int */
    MTestPrintfMsg( 10, "Reduce of MPI_INT\n" );
    iinbuf[0] = 0xffffffff;
    iinbuf[1] = 0;
    iinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3;

    ioutbuf[0] = 0;
    ioutbuf[1] = 1;
    ioutbuf[2] = 1;
    rc = MPI_Reduce( iinbuf, ioutbuf, 3, MPI_INT, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_INT", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (ioutbuf[0] != ((size % 2) ? 0xffffffff : 0)) {
		errs++;
		fprintf( stderr, "int BXOR(1) test failed\n" );
	    }
	    if (ioutbuf[1]) {
		errs++;
		fprintf( stderr, "int BXOR(0) test failed\n" );
	    }
	    if (ioutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) {
		errs++;
		fprintf( stderr, "int BXOR(>) test failed\n" );
	    }
	}
    }

    /* long */
    MTestPrintfMsg( 10, "Reduce of MPI_LONG\n" );
    linbuf[0] = 0xffffffff;
    linbuf[1] = 0;
    linbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3;

    loutbuf[0] = 0;
    loutbuf[1] = 1;
    loutbuf[2] = 1;
    rc = MPI_Reduce( linbuf, loutbuf, 3, MPI_LONG, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_LONG", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (loutbuf[0] != ((size % 2) ? 0xffffffff : 0)) {
		errs++;
		fprintf( stderr, "long BXOR(1) test failed\n" );
	    }
	    if (loutbuf[1]) {
		errs++;
		fprintf( stderr, "long BXOR(0) test failed\n" );
	    }
	    if (loutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) {
		errs++;
		fprintf( stderr, "long BXOR(>) test failed\n" );
	    }
	}
    }

    /* unsigned long */
    MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_LONG\n" );
    ulinbuf[0] = 0xffffffff;
    ulinbuf[1] = 0;
    ulinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3;

    uloutbuf[0] = 0;
    uloutbuf[1] = 1;
    uloutbuf[2] = 1;
    rc = MPI_Reduce( ulinbuf, uloutbuf, 3, MPI_UNSIGNED_LONG, MPI_BXOR, 0, comm );
    if (rc) {
	MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_LONG", rc );
	errs++;
    }
    else {
	if (rank == 0) {
	    if (uloutbuf[0] != ((size % 2) ? 0xffffffff : 0)) {
		errs++;
		fprintf( stderr, "unsigned long BXOR(1) test failed\n" );
	    }
	    if (uloutbuf[1]) {
		errs++;
		fprintf( stderr, "unsigned long BXOR(0) test failed\n" );
	    }
	    if (uloutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) {
		errs++;
		fprintf( stderr, "unsigned long BXOR(>) test failed\n" );
	    }
	}
    }

#ifdef HAVE_LONG_LONG
    {
	long long llinbuf[3], lloutbuf[3];
    /* long long */
    llinbuf[0] = 0xffffffff;
    llinbuf[1] = 0;
    llinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3;

    lloutbuf[0] = 0;
    lloutbuf[1] = 1;
    lloutbuf[2] = 1;
    if (MPI_LONG_LONG != MPI_DATATYPE_NULL) {
	MTestPrintfMsg( 10, "Reduce of MPI_LONG_LONG\n" );
	rc = MPI_Reduce( llinbuf, lloutbuf, 3, MPI_LONG_LONG, MPI_BXOR, 0, comm );
	if (rc) {
	    MTestPrintErrorMsg( "MPI_BXOR and MPI_LONG_LONG", rc );
	    errs++;
	}
	else {
	    if (rank == 0) {
		if (lloutbuf[0] != ((size % 2) ? 0xffffffff : 0)) {
		    errs++;
		    fprintf( stderr, "long long BXOR(1) test failed\n" );
		}
		if (lloutbuf[1]) {
		    errs++;
		    fprintf( stderr, "long long BXOR(0) test failed\n" );
		}
		if (lloutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) {
		    errs++;
		    fprintf( stderr, "long long BXOR(>) test failed\n" );
		}
	    }
	}
    }
    }
#endif

    MPI_Errhandler_set( comm, MPI_ERRORS_ARE_FATAL );
    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
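Every check above relies on the same parity identity: XOR-ing a value
with itself cancels, so reducing identical contributions from size
ranks yields the value when size is odd and 0 when it is even. A
minimal standalone sketch for MPI_INT:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, size, in = 0xc3c3c3c3, out = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Reduce(&in, &out, 1, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);

    /* x ^ x == 0, so `size` copies of x collapse to x or 0 */
    if (rank == 0)
        printf("expected %#x, got %#x\n",
               (size % 2) ? 0xc3c3c3c3 : 0, out);

    MPI_Finalize();
    return 0;
}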
Example #21
0
int main(int argc, char *argv[])
{
  int rank;
  int size;
  int i;
  int status;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  unsigned long long* sb = (unsigned long long *) xbt_malloc(size * sizeof(unsigned long long));
  unsigned long long* rb = (unsigned long long *) xbt_malloc(size * sizeof(unsigned long long));

  for (i = 0; i < size; ++i) {
    sb[i] = rank*size + i;
    rb[i] = 0;
  }
  printf("[%d] sndbuf=[", rank);
  for (i = 0; i < size; i++)
    printf("%llu ", sb[i]);
  printf("]\n");

  int root=0;
  status = MPI_Reduce(sb, rb, size, MPI_UNSIGNED_LONG_LONG, MPI_SUM, root, MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == root) {
    printf("[%d] rcvbuf=[", rank);
    for (i = 0; i < size; i++)
      printf("%llu ", rb[i]);
    printf("]\n");
    if (status != MPI_SUCCESS) {
      printf("all_to_all returned %d\n", status);
      fflush(stdout);
    }
  }

  printf("[%d] second sndbuf=[", rank);
  for (i = 0; i < 1; i++)
    printf("%llu ", sb[i]);
  printf("]\n");

  root=size-1;
  status = MPI_Reduce(sb, rb, 1, MPI_UNSIGNED_LONG_LONG, MPI_PROD, root, MPI_COMM_WORLD);
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == root) {
    printf("[%d] rcvbuf=[", rank);
    for (i = 0; i < 1; i++)
      printf("%llu ", rb[i]);
    printf("]\n");
    if (status != MPI_SUCCESS) {
      printf("all_to_all returned %d\n", status);
      fflush(stdout);
    }
  }
  free(sb);
  free(rb);
  MPI_Finalize();
  return (EXIT_SUCCESS);
}
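One caveat about the status checks in this example: MPI's default error
handler (MPI_ERRORS_ARE_FATAL) aborts on failure, so MPI_Reduce would
normally never return a code other than MPI_SUCCESS. For the checks to
be able to fire, the error handler has to be switched first, e.g.:

/* Let MPI calls return error codes instead of aborting. */
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);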
Example #22
0
void pion_norm(const int traj, const int id) {
  int i, j, z, zz, z0;
  double *Cpp;
  double res = 0.;
  double pionnorm;
  double atime, etime;
  float tmp;
#ifdef MPI
  double mpi_res = 0.;
#endif
  FILE *ofs, *ofs2;
  char *filename, *filename2, *sourcefilename;
  char buf[100];
  char buf2[100];
  char buf3[100];
  filename=buf;
  filename2=buf2;
  sourcefilename=buf3;
  sprintf(filename,"pionnormcorrelator_finiteT.%.6d",traj);
  sprintf(filename2,"%s", "pion_norm.data");

  /* generate random source point */
  if(ranlxs_init == 0) {
    rlxs_init(1, 123456);
  }
  ranlxs(&tmp, 1);
  z0 = (int)(measurement_list[id].max_source_slice*tmp);
#ifdef MPI
  MPI_Bcast(&z0, 1, MPI_INT, 0, MPI_COMM_WORLD);
#endif

#ifdef MPI
  atime = MPI_Wtime();
#else
  atime = (double)clock()/(double)(CLOCKS_PER_SEC);
#endif

  Cpp = (double*) calloc(g_nproc_z*LZ, sizeof(double));

  printf("Doing finite Temperature online measurement\n");
  
  /* stochastic source in z-slice */
  source_generation_pion_zdir(g_spinor_field[0], g_spinor_field[1], 
                            z0, 0, traj);
  

  /* invert on the stochastic source */
  invert_eo(g_spinor_field[2], g_spinor_field[3], 
            g_spinor_field[0], g_spinor_field[1],
            1.e-14, measurement_list[id].max_iter, CG, 1, 0, 1, 0, NULL, -1);

  /* now we bring it to normal format */
  /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */
  convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]);
  
  /* now we sum only over the local volume for every z */
  for(z = 0; z < LZ; z++) {
    res = 0.;
    /* sum here over all points in one z-slice;
       we have to look up g_ipt */

    j = g_ipt[0][0][0][z];
    for(i = 0; i < T*LX*LY ; i++) {
           res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][j], g_spinor_field[DUM_MATRIX][j]);
           j += LZ; /* jump LZ sites in the array, z is the fastest index */
    }


    
#if defined MPI
    MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_z_slices);
    res = mpi_res;
#endif
    Cpp[z+g_proc_coords[3]*LZ] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_t*T)*2.;
  }

#ifdef MPI
  /* some gymnastics needed in case of parallelisation */
  if(g_mpi_z_rank == 0) {
    MPI_Gather(&Cpp[g_proc_coords[3]*LZ], LZ, MPI_DOUBLE, Cpp, LZ, MPI_DOUBLE, 0, g_mpi_ST_slices);
  }
#endif


  /* and write everything into a file */
  if(g_mpi_z_rank == 0 && g_proc_coords[3] == 0) {
    ofs = fopen(filename, "w");
    fprintf( ofs, "1  1  0  %e  %e\n", Cpp[z0], 0.);
    for(z = 1; z < g_nproc_z*LZ/2; z++) {
      zz = (z0+z)%(g_nproc_z*LZ);
      fprintf( ofs, "1  1  %d  %e  ", z, Cpp[zz]);
      zz = (z0+g_nproc_z*LZ-z)%(g_nproc_z*LZ);
      fprintf( ofs, "%e\n", Cpp[zz]);
    }
    zz = (z0+g_nproc_z*LZ/2)%(g_nproc_z*LZ);
    fprintf( ofs, "1  1  %d  %e  %e\n", z, Cpp[zz], 0.);
    fclose(ofs);
    
    /* sum over all Cpp to get pionnorm*/
    ofs2 = fopen(filename2, "a");
    pionnorm = 0.;
    for(z=0; z<g_nproc_z*LZ; z++){
      pionnorm += Cpp[z];
    }
    /* normalize */
    pionnorm = pionnorm/(g_nproc_z*LZ); 
    fprintf(ofs2,"%d\t %.16e\n",traj,pionnorm);
    fclose(ofs2);
  }
  
  free(Cpp);
#ifdef MPI
  etime = MPI_Wtime();
#else
  etime = (double)clock()/(double)(CLOCKS_PER_SEC);
#endif
  if(g_proc_id == 0 && g_debug_level > 0) {
    printf("PIONNORM : measurement done int t/s = %1.4e\n", etime - atime);
  }
  return;
}
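The reductions and gathers here run on sub-communicators
(g_mpi_z_slices, g_mpi_ST_slices) that group ranks sharing a slice of
the lattice. Such communicators are typically produced with
MPI_Comm_split; a minimal sketch, assuming each rank knows its z
coordinate (make_slice_comm and z_coord are illustrative, not part of
the code above):

#include <mpi.h>

/* Build one communicator per z-slice, so a subsequent MPI_Reduce
 * sums only within the slice. */
MPI_Comm make_slice_comm(int z_coord)
{
  int myid;
  MPI_Comm slice_comm;
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  /* ranks passing the same color end up in the same communicator */
  MPI_Comm_split(MPI_COMM_WORLD, z_coord, myid, &slice_comm);
  return slice_comm;
}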
Example #23
0
void compute_error_p3dfft(double * A,int* n)
{
  double pi=4*atan(1.0);
  long long int size=n[0];
  size*=n[1]; size*=n[2];
  int istart[3],iend[3],isize[3];
  int xx=0, yy=0, zz=0;
  //double an,aa;
  Cp3dfft_get_dims(istart,iend,isize,1);
  int num_th= omp_get_max_threads();
  double diff_th[num_th],norm_th[num_th];
  for (int i=0;i<num_th;i++){
    diff_th[i]=0.;norm_th[i]=0.;
  }


  int procid;
  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
  //First Determine the constant difference between analytic and numerical solutions

#pragma omp parallel
  {
    long int ptr;
    double X,Y,Z;
    double analytic_solution,diff=0.0;
    int thid=omp_get_thread_num();
    //  double aa=0.,an=0.;
#pragma omp for
    for (int k=0; k<isize[2]; k++){
      for (int j=0; j<isize[1]; j++){
        for (int i=0; i<isize[0]; i++){
          X=2*pi/n[0]*(istart[0]-1+i);
          Y=2*pi/n[1]*(istart[1]-1+j);
          Z=2*pi/n[2]*(istart[2]-1+k);
          ptr=i+isize[0]*j+isize[0]*isize[1]*k;
          analytic_solution=testcase(X,Y,Z);
          diff=std::abs(A[ptr]/size-analytic_solution);

          diff_th[thid]+=diff;
          norm_th[thid]+=analytic_solution;
          //std::cout<<"x,y,z= "<<X<<Y<<Z<<" diff=  "<<diff<<" analytic= "<<analytic_solution<<" numerical=  "<<A[ptr]/size<<std::endl;

        }
      }
    }
  }
  double DIFF=0.,NORM=0.;
  for (int i=0; i<num_th;i++){
    DIFF+=diff_th[i];
    NORM+=norm_th[i];

  }

  double gerr=0,gnorm=0;
  MPI_Reduce(&DIFF,&gerr,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD);
  MPI_Reduce(&NORM,&gnorm,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD);
  PCOUT<<"The L1 error between iFF(a)-a is == "<<gerr<<std::endl;
  PCOUT<<"The Rel. L1 error between iFF(a)-a is == "<<gerr/gnorm<<std::endl;
  if(gerr/gnorm>1e-10){
      PCOUT<<"\033[1;31m ERROR!!! FFT not computed correctly!\033[0m"<<std::endl;
  }
  else{
      PCOUT<<"\033[1;36m FFT computed correctly!\033[0m"<<std::endl;
  }

}
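The per-thread partial arrays diff_th/norm_th avoid a data race before
the MPI reduction; OpenMP's reduction clause expresses the same thing
more compactly. A sketch under the assumption that the analytic values
are precomputed into an array (l1_error, analytic and n_local are
illustrative names):

#include <math.h>
#include <mpi.h>

/* Thread-parallel local L1 error, then a cross-rank sum. */
double l1_error(const double *A, const double *analytic,
                long n_local, double scale)
{
  double diff_sum = 0.0;
#pragma omp parallel for reduction(+:diff_sum)
  for (long i = 0; i < n_local; i++)
    diff_sum += fabs(A[i]/scale - analytic[i]);

  double g_err = 0.0;
  MPI_Reduce(&diff_sum, &g_err, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);
  return g_err; /* meaningful on rank 0 only */
}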
Example #24
0
void reb_output_timing(struct reb_simulation* r, const double tmax){
	const int N = r->N;
#ifdef MPI
	int N_tot = 0;
	MPI_Reduce(&N, &N_tot, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); 
	if (r->mpi_id!=0) return;
#else
	int N_tot = N;
#endif
	struct timeval tim;
	gettimeofday(&tim, NULL);
	double temp = tim.tv_sec+(tim.tv_usec/1000000.0);
	if (r->output_timing_last==-1){
		r->output_timing_last = temp;
	}else{
		printf("\r");
#ifdef PROFILING
		fputs("\033[A\033[2K",stdout);
		for (int i=0;i<=PROFILING_CAT_NUM;i++){
			fputs("\033[A\033[2K",stdout);
		}
#endif // PROFILING
	}
	printf("N_tot= %- 9d  ",N_tot);
	if (r->integrator==REB_INTEGRATOR_SEI){
		printf("t= %- 9f [orb]  ",r->t*r->ri_sei.OMEGA/2./M_PI);
	}else{
		printf("t= %- 9f  ",r->t);
	}
	printf("dt= %- 9f  ",r->dt);
	if (r->integrator==REB_INTEGRATOR_HYBRID){
		printf("INT= %- 1d  ",r->ri_hybrid.mode);
	}
	printf("cpu= %- 9f [s]  ",temp-r->output_timing_last);
	if (tmax>0){
		printf("t/tmax= %5.2f%%",r->t/tmax*100.0);
	}
#ifdef PROFILING
	if (profiling_timing_initial==0){
		struct timeval tim;
		gettimeofday(&tim, NULL);
		profiling_timing_initial = tim.tv_sec+(tim.tv_usec/1000000.0);
	}
	printf("\nCATEGORY       TIME \n");
	double _sum = 0;
	for (int i=0;i<=PROFILING_CAT_NUM;i++){
		switch (i){
			case PROFILING_CAT_INTEGRATOR:
				printf("Integrator     ");
				break;
			case PROFILING_CAT_BOUNDARY:
				printf("Boundary check ");
				break;
			case PROFILING_CAT_GRAVITY:
				printf("Gravity/Forces ");
				break;
			case PROFILING_CAT_COLLISION:
				printf("Collisions     ");
				break;
#ifdef OPENGL
			case PROFILING_CAT_VISUALIZATION:
				printf("Visualization  ");
				break;
#endif // OPENGL
			case PROFILING_CAT_NUM:
				printf("Other          ");
				break;
		}
		if (i==PROFILING_CAT_NUM){
			printf("%5.2f%%",(1.-_sum/(profiling_time_final - profiling_timing_initial))*100.);
		}else{
			printf("%5.2f%%\n",profiling_time_sum[i]/(profiling_time_final - profiling_timing_initial)*100.);
			_sum += profiling_time_sum[i];
		}
	}
#endif // PROFILING
	fflush(stdout);
	r->output_timing_last = temp;
}
Example #25
0
int main (int argc, char * argv[]) {
    int rank,size;
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);

    int X,Y,x,y,X_ext,i;    
    double **A, **localA;
    X=atoi(argv[1]);
    Y=X;

    //Extend dimension X with ghost cells if X%size!=0
    if (X%size!=0)
        X_ext=X+size-X%size;
    else
        X_ext=X;
      

    if (rank==0) {
        //Allocate and init matrix A
        A=malloc2D(X_ext,Y);
        init2D(A,X,Y);
    }

    //Local dimensions x,y
    x=X_ext/size;
    y=Y;

    //Allocate local matrix and scatter global matrix
    localA=malloc2D(x,y);
    double * idx;
    for (i=0;i<x;i++) {
        if (rank==0)
            idx=&A[i*size][0];            
        MPI_Scatter(idx,Y,MPI_DOUBLE,&localA[i][0],y,MPI_DOUBLE,0,MPI_COMM_WORLD);
    }
    if (rank==0)
        free2D(A,X_ext,Y);
 
    //Timers   
    struct timeval ts,tf,comps,compf,comms,commf;
    double total_time=0.,computation_time=0.,communication_time=0.;

    MPI_Barrier(MPI_COMM_WORLD);
    gettimeofday(&ts,NULL);        

    /******************************************************************************
     The matrix A is distributed in a round-robin fashion to the local matrices localA
     You have to use point-to-point communication routines.
     Don't forget the timers for computation and communication!
        
    ******************************************************************************/
    
    int line_index, line_owner;
    int k, start;
    double *k_row, *temp;
    MPI_Status status;
    
    temp = malloc(y * sizeof(*temp));
//     k_row = malloc(y * sizeof(*k_row));
    
    /* as in the other cyclic version, we find the line_owner */
    for (k=0; k<y-1; k++){
	line_owner = k % size;
	line_index = k / size;
	
	if (rank <= line_owner)
	    start = k / size + 1;
	else
	    start = k / size;
	
	if (rank == line_owner)
	    k_row = localA[line_index];
	else
	    k_row = temp;
	
	/* set communication timer */
	gettimeofday(&comms, NULL);
	
	/* COMM */
	
// 	if (rank != line_owner){
// 	    if (rank == 0)
// 		MPI_Recv( k_row, y, MPI_DOUBLE, size-1, MPI_ANY_SOURCE, MPI_COMM_WORLD, &status);
// 	    else
// 		MPI_Recv( k_row, y, MPI_DOUBLE, rank-1, MPI_ANY_SOURCE, MPI_COMM_WORLD, &status);
// 	}
// 	
// 	/* the rank just before the line_owner must not send */
// 	if (rank != line_owner -1){
// 	    /* the last rank sends to the first, unless the first is the line_owner */
// 	    if (rank == size-1) {
// 		if (line_owner != 0)
// 		    MPI_Send( k_row, y, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD);
// 	    }
// 	    else
// 		MPI_Send(k_row, y, MPI_DOUBLE, rank+1, rank, MPI_COMM_WORLD);
// 	}

	/* the line_owner sends to everyone (except itself) and all the
	 * others receive */
	if (rank == line_owner){
	    for (i=0; i<size; i++)
		if (i != line_owner)
		    MPI_Send( k_row, y, MPI_DOUBLE, i, line_owner, MPI_COMM_WORLD);
	}
	else
	    MPI_Recv(k_row, y, MPI_DOUBLE, line_owner, line_owner, MPI_COMM_WORLD, &status);
	
	/* stop communication timer */
	gettimeofday(&commf, NULL);
	communication_time += commf.tv_sec - comms.tv_sec + (commf.tv_usec - comms.tv_usec)*0.000001;
	
	/* set computation timer */
	gettimeofday(&comps, NULL);
	
	/* Compute */
	go_to_work( localA, k_row, x, y, rank, start, k );
	
	/* stop computation timer */
	gettimeofday(&compf, NULL);
	computation_time += compf.tv_sec - comps.tv_sec + (compf.tv_usec - comps.tv_usec)*0.000001;
    }

    gettimeofday(&tf,NULL);
    total_time=tf.tv_sec-ts.tv_sec+(tf.tv_usec-ts.tv_usec)*0.000001;


    //Gather local matrices back to the global matrix
    if (rank==0) 
        A=malloc2D(X_ext,Y);
    for (i=0;i<x;i++) {
            if (rank==0)
                idx=&A[i*size][0];
            MPI_Gather(&localA[i][0],y,MPI_DOUBLE,idx,Y,MPI_DOUBLE,0,MPI_COMM_WORLD);
    }
    
    double avg_total,avg_comp,avg_comm,max_total,max_comp,max_comm;
    MPI_Reduce(&total_time,&max_total,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
    MPI_Reduce(&computation_time,&max_comp,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
    MPI_Reduce(&communication_time,&max_comm,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
    MPI_Reduce(&total_time,&avg_total,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    MPI_Reduce(&computation_time,&avg_comp,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    MPI_Reduce(&communication_time,&avg_comm,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);

    avg_total/=size;
    avg_comp/=size;
    avg_comm/=size;

    if (rank==0) {
        printf("LU-Cyclic-p2p\tSize\t%d\tProcesses\t%d\n",X,size);
        printf("Max times:\tTotal\t%lf\tComp\t%lf\tComm\t%lf\n",max_total,max_comp,max_comm);
        printf("Avg times:\tTotal\t%lf\tComp\t%lf\tComm\t%lf\n",avg_total,avg_comp,avg_comm);
    }

    //Print triangular matrix U to file
    if (rank==0) {
        char * filename="output_cyclic_p2p";
        print2DFile(A,X,Y,filename);
    }


    MPI_Finalize();

    return 0;
}
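The fan-out inside the communication block, where the line_owner sends
the pivot row to every other rank and everyone else receives, is
semantically a broadcast rooted at line_owner. The idiomatic one-call
equivalent would be the sketch below (same k_row, y and line_owner as
above):

/* Replace the send/recv loop: on the root k_row already holds the
 * pivot row, on every other rank it is overwritten. */
MPI_Bcast(k_row, y, MPI_DOUBLE, line_owner, MPI_COMM_WORLD);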
Example #26
0
int main( int argc, char *argv[])
{
    int  n, myid, numprocs, i, j;
    double PI25DT = 3.141592653589793238462643;
    double mypi, pi, h, sum, x;
    double startwtime = 0.0, endwtime;
    int namelen; 
    int event1a, event1b, event2a, event2b,
        event3a, event3b, event4a, event4b;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc,&argv);

        MPI_Pcontrol( 0 );

    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);

    MPI_Get_processor_name(processor_name,&namelen);
    fprintf(stderr,"Process %d running on %s\n", myid, processor_name);

    /*
        MPE_Init_log() & MPE_Finish_log() are NOT needed when
        liblmpe.a is linked with this program.  In that case,
        MPI_Init() would have called MPE_Init_log() already.
    */
/*
    MPE_Init_log();
*/

    /*  Get event ID from MPE, user should NOT assign event ID  */
    event1a = MPE_Log_get_event_number(); 
    event1b = MPE_Log_get_event_number(); 
    event2a = MPE_Log_get_event_number(); 
    event2b = MPE_Log_get_event_number(); 
    event3a = MPE_Log_get_event_number(); 
    event3b = MPE_Log_get_event_number(); 
    event4a = MPE_Log_get_event_number(); 
    event4b = MPE_Log_get_event_number(); 

    if (myid == 0) {
        MPE_Describe_state(event1a, event1b, "Broadcast", "red");
        MPE_Describe_state(event2a, event2b, "Compute",   "blue");
        MPE_Describe_state(event3a, event3b, "Reduce",    "green");
        MPE_Describe_state(event4a, event4b, "Sync",      "orange");
    }

    if (myid == 0) {
        n = 1000000;
        startwtime = MPI_Wtime();
    }
    MPI_Barrier(MPI_COMM_WORLD);

        MPI_Pcontrol( 1 );
        /*
    MPE_Start_log();
        */

    for (j = 0; j < 5; j++) {
        MPE_Log_event(event1a, 0, NULL);
        MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPE_Log_event(event1b, 0, NULL);

        MPE_Log_event(event4a, 0, NULL);
        MPI_Barrier(MPI_COMM_WORLD);
        MPE_Log_event(event4b, 0, NULL);

        MPE_Log_event(event2a, 0, NULL);
        h   = 1.0 / (double) n;
        sum = 0.0;
        for (i = myid + 1; i <= n; i += numprocs) {
            x = h * ((double)i - 0.5);
            sum += f(x);
        }
        mypi = h * sum;
        MPE_Log_event(event2b, 0, NULL);

        MPE_Log_event(event3a, 0, NULL);
        MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        MPE_Log_event(event3b, 0, NULL);
    }
/*
    MPE_Finish_log("cpilog");
*/

    if (myid == 0) {
        endwtime = MPI_Wtime();
        printf("pi is approximately %.16f, Error is %.16f\n",
               pi, fabs(pi - PI25DT));
        printf("wall clock time = %f\n", endwtime-startwtime);
    }
    MPI_Finalize();
    return(0);
}
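The integrand f used in the compute phase is not part of this listing;
in the classic cpi/cpilog codes it is the midpoint-rule integrand whose
integral over [0,1] is pi, which the PI25DT comparison suggests here as
well. A sketch of that assumed definition:

/* Assumed integrand: the integral of 4/(1+x^2) over [0,1] equals pi,
 * so the midpoint-rule sum in the loop above approximates pi. */
double f(double x)
{
    return 4.0 / (1.0 + x * x);
}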
Example #27
0
int main(int argc, char ** argv) {
   
  int             Num_procs;         // number of ranks 
  int             Num_procsx, 
                  Num_procsy;        // number of ranks in each coord direction      
  int             args_used = 1;     // keeps track of # consumed arguments
  int             my_ID;             // MPI rank
  int             my_IDx, my_IDy;    // coordinates of rank in rank grid                  
  int             root = 0;          // master rank
  uint64_t        L;                 // dimension of grid in cells
  uint64_t        iterations ;       // total number of simulation steps
  uint64_t        n;                 // total number of particles requested in the simulation
  uint64_t        actual_particles,  // actual number of particles owned by my rank
                  total_particles;   // total number of generated particles
  char            *init_mode;        // particle initialization mode (char)
  double          rho ;              // attenuation factor for geometric particle distribution
  uint64_t        k, m;              // determine initial horizontal and vertical velocity of 
                                     // particles-- (2*k)+1 cells per time step
  double          *grid;             // the grid is represented as an array of charges
  uint64_t        iter, i;           // dummies
  double          fx, fy, ax, ay;    // particle forces and accelerations
  int             error=0;           // used for graceful exit after error
  uint64_t        correctness=0;     // boolean indicating correct particle displacements
  uint64_t        istart, jstart, iend, jend, particles_size, particles_count;
  bbox_t          grid_patch,        // whole grid
                  init_patch,        // subset of grid used for localized initialization
                  my_tile;           // subset of grid owner by my rank
  particle_t      *particles, *p;    // array of particles owned by my rank
  uint64_t        *cur_counts;       //
  uint64_t        ptr_my;            //
  uint64_t        owner;             // owner (rank) of a particular particle
  double          pic_time, local_pic_time, avg_time;
  uint64_t        my_checksum = 0, tot_checksum = 0, correctness_checksum = 0;
  uint64_t        width, height;     // minimum dimensions of grid tile owned by my rank
  int             particle_mode;     // type of initialization
  double          alpha, beta;       // negative slope and offset for linear initialization
  int             nbr[8];            // topological neighbor ranks
  int             icrit, jcrit;      // global grid indices where tile size drops to minimum
  find_owner_type find_owner;
  int             ileftover, jleftover;// excess grid points divided among "fat" tiles
  uint64_t        to_send[8], to_recv[8];// 
  int             procsize;          // number of ranks per OS process                      
   
  MPI_Status  status[16];
  MPI_Request requests[16];
   
  /* Initialize the MPI environment */
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /* FIXME: This can be further improved */
  /* Create MPI data type for particle_t */
  MPI_Datatype PARTICLE;
  MPI_Type_contiguous(sizeof(particle_t)/sizeof(double), MPI_DOUBLE, &PARTICLE);
  MPI_Type_commit( &PARTICLE );

  if (my_ID==root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("FG_MPI Particle-in-Cell execution on 2D grid\n");

    if (argc<7) { /* six positional arguments are consumed before the mode-specific ones */
      printf("Usage: %s <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]);
      printf("<m (vertical particle velocity)>\n");
      printf("          <init mode> [<init parameters>]\n");
      printf("   init mode \"GEOMETRIC\"  parameters: <attenuation factor>\n");
      printf("             \"SINUSOIDAL\" parameters: none\n");
      printf("             \"LINEAR\"     parameters: <negative slope> <constant offset>\n");
      printf("             \"PATCH\"      parameters: <xleft> <xright>  <ybottom> <ytop>\n");
      error = 1;
      goto ENDOFTESTS;
    }

    iterations = atol(*++argv);  args_used++;   
    if (iterations<1) {
      printf("ERROR: Number of time steps must be positive: %llu\n", iterations);
      error = 1;
      goto ENDOFTESTS;  
    }

    L = atol(*++argv);  args_used++;   
    if (L<1 || L%2) {
      printf("ERROR: Number of grid cells must be positive and even: %llu\n", L);
      error = 1;
      goto ENDOFTESTS;  
    }
    n = atol(*++argv);  args_used++;   
    if (n<1) {
      printf("ERROR: Number of particles must be positive: %llu\n", n);
      error = 1;
      goto ENDOFTESTS;  
    }

    particle_mode  = UNDEFINED;
    long k_in = atol(*++argv);   args_used++; 
    if (k_in<0) {
      printf("ERROR: Particle semi-charge must be non-negative: %ld\n", k_in);
      error = 1;
      goto ENDOFTESTS;  
    }
    k = (uint64_t) k_in;   /* k is unsigned, so check the sign before converting */
    m = atol(*++argv);   args_used++; 
    init_mode = *++argv; args_used++;  

    ENDOFTESTS:;  

  } // done with standard initialization parameters
  bail_out(error);

  MPI_Bcast(&iterations, 1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&L,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&n,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&k,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&m,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);

  grid_patch = (bbox_t){0, L+1, 0, L+1};
   
  if (my_ID==root) { // process initialization parameters
    /* Initialize particles with geometric distribution */
    if (strcmp(init_mode, "GEOMETRIC") == 0) {
      if (argc<args_used+1) {
        printf("ERROR: Not enough arguments for GEOMETRIC\n"); 
        error = 1;
        goto ENDOFTESTS2;  
      }
      particle_mode = GEOMETRIC;
      rho = atof(*++argv);   args_used++;
    }
   
    /* Initialize with a sinusoidal particle distribution (single period) */
    if (strcmp(init_mode, "SINUSOIDAL") == 0) {
      particle_mode = SINUSOIDAL;
    }
   
    /* Initialize particles with linear distribution */
    /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/
    if (strcmp(init_mode, "LINEAR") == 0) {
      if (argc<args_used+2) {
        printf("ERROR: Not enough arguments for LINEAR initialization\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
      particle_mode = LINEAR;
      alpha = atof(*++argv); args_used++; 
      beta  = atof(*++argv); args_used++;
      if (beta <0 || beta<alpha) {
        printf("ERROR: linear profile gives negative particle density\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
    }
   
    /* Initialize particles uniformly within a "patch" */
    if (strcmp(init_mode, "PATCH") == 0) {
      if (argc<args_used+4) {
        printf("ERROR: Not enough arguments for PATCH initialization\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
      particle_mode = PATCH;
      init_patch.left   = atoi(*++argv); args_used++;
      init_patch.right  = atoi(*++argv); args_used++;
      init_patch.bottom = atoi(*++argv); args_used++;
      init_patch.top    = atoi(*++argv); args_used++;
      if (bad_patch(&init_patch, &grid_patch)) {
        printf("ERROR: inconsistent initial patch\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
    }
    ENDOFTESTS2:;  

  } // done with processing initialization parameters, now broadcast

  bail_out(error);

  MPI_Bcast(&particle_mode, 1, MPI_INT, root, MPI_COMM_WORLD);
  switch (particle_mode) {
  case GEOMETRIC:  MPI_Bcast(&rho,               1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   break;
  case SINUSOIDAL: break;
  case LINEAR:     MPI_Bcast(&alpha,             1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   MPI_Bcast(&beta,              1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   break;
  case PATCH:      MPI_Bcast(&init_patch.left,   1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.right,  1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.bottom, 1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.top,    1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   break;
  }
   
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now                        */

  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
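
  /* MPI_Dims_create computes a comparable near-square factorization without
     the hand-rolled loop.  It returns the factors in non-increasing order,
     so the smaller one maps to Num_procsx to match the loop above (sketch):

     int dims[2] = {0, 0};
     MPI_Dims_create(Num_procs, 2, dims);
     Num_procsx = dims[1];
     Num_procsy = dims[0];
  */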
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;

  if (my_ID == root) {
    MPIX_Get_collocated_size(&procsize);
    printf("Number of ranks                    = %llu\n", Num_procs);
    printf("Number of ranks/process            = %d\n", procsize);
    printf("Load balancing                     = None\n");
    printf("Grid size                          = %llu\n", L);
    printf("Tiles in x/y-direction             = %d/%d\n", Num_procsx, Num_procsy);
    printf("Number of particles requested      = %llu\n", n); 
    printf("Number of time steps               = %llu\n", iterations);
    printf("Initialization mode                = %s\n",   init_mode);
    switch(particle_mode) {
    case GEOMETRIC: printf("  Attenuation factor               = %lf\n", rho);    break;
    case SINUSOIDAL:                                                              break;
    case LINEAR:    printf("  Negative slope                   = %lf\n", alpha);
                    printf("  Offset                           = %lf\n", beta);   break;
    case PATCH:     printf("  Bounding box                     = %lld, %lld, %lld, %lld\n",
                           (long long) init_patch.left, (long long) init_patch.right, 
                           (long long) init_patch.bottom, (long long) init_patch.top);  break;
    default:        printf("ERROR: Unsupported particle initialization mode\n");
                    error = 1;
    }
    printf("Particle charge semi-increment (k) = %llu\n", k);
    printf("Vertical velocity              (m) = %llu\n", m);
  }
  bail_out(error);

  /* The processes collectively create the underlying grid following a 2D block decomposition;
     unlike in the stencil code, successive blocks share an overlap vertex                   */
  width = L/Num_procsx;
  if (width < 2*k) {
    if (my_ID==0) printf("k-value too large: %llu, must be no greater than %llu\n", k, width/2);
    bail_out(1);
  }
  ileftover = L%Num_procsx;
  if (my_IDx<ileftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * ileftover + width * (my_IDx-ileftover);
    iend = istart + width;
  }
  icrit = (width+1) * ileftover;

  height = L/Num_procsy;
  if (height < m) {
    if (my_ID==0) printf("m-value too large: %llu, must be no greater than %llu\n", m, height);
    bail_out(1);
  }

  jleftover = L%Num_procsy;
  if (my_IDy<jleftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * jleftover + height * (my_IDy-jleftover);
    jend = jstart + height;
  }
  jcrit = (height+1) * jleftover;

  /* if the problem can be divided evenly among ranks, use the simple owner finding function */
  if (icrit==0 && jcrit==0) {
    find_owner = find_owner_simple;
    if (my_ID==root) printf("Rank search mode used              = simple\n");
  }
  else {
    find_owner = find_owner_general;
    if (my_ID==root) printf("Rank search mode used              = general\n");
  }
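
  /* find_owner_simple/find_owner_general are PRK support routines not shown
     here.  In the evenly divisible case the mapping is plain floor division;
     a sketch, with the signature inferred from the call site below and the
     assumption that the particle coordinates are its x/y fields:

     uint64_t find_owner_simple(particle_t p, uint64_t width, uint64_t height,
                                int Num_procsx, int icrit, int jcrit,
                                int ileftover, int jleftover) {
       uint64_t IDx = (uint64_t)p.x / width;    // tile column
       uint64_t IDy = (uint64_t)p.y / height;   // tile row
       return IDy*(uint64_t)Num_procsx + IDx;   // x varies fastest in the rank grid
     }
  */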

  /* define bounding box for tile owned by my rank for convenience */
  my_tile = (bbox_t){istart,iend,jstart,jend};

  /* Find neighbors. Indexing: left=0, right=1, bottom=2, top=3, 
                               bottom-left=4, bottom-right=5, top-left=6, top-right=7 */

  /* These are IDs in the global communicator */
  nbr[0] = (my_IDx == 0           ) ? my_ID  + Num_procsx - 1         : my_ID  - 1;
  nbr[1] = (my_IDx == Num_procsx-1) ? my_ID  - Num_procsx + 1         : my_ID  + 1;
  nbr[2] = (my_IDy == Num_procsy-1) ? my_ID  + Num_procsx - Num_procs : my_ID  + Num_procsx;
  nbr[3] = (my_IDy == 0           ) ? my_ID  - Num_procsx + Num_procs : my_ID  - Num_procsx;
  nbr[4] = (my_IDy == Num_procsy-1) ? nbr[0] + Num_procsx - Num_procs : nbr[0] + Num_procsx;
  nbr[5] = (my_IDy == Num_procsy-1) ? nbr[1] + Num_procsx - Num_procs : nbr[1] + Num_procsx;
  nbr[6] = (my_IDy == 0           ) ? nbr[0] - Num_procsx + Num_procs : nbr[0] - Num_procsx;
  nbr[7] = (my_IDy == 0           ) ? nbr[1] - Num_procsx + Num_procs : nbr[1] - Num_procsx;
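
  /* The same periodic neighbor table can be derived from a Cartesian
     communicator; a sketch (reorder=0 keeps the ranks identical to those in
     MPI_COMM_WORLD, and the four corners still need the manual arithmetic
     above, since MPI_Cart_shift walks only one dimension at a time):

     MPI_Comm cart_comm;
     int dims[2]    = {Num_procsy, Num_procsx};
     int periods[2] = {1, 1};
     MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart_comm);
     MPI_Cart_shift(cart_comm, 1, 1, &nbr[0], &nbr[1]);   // left / right
     MPI_Cart_shift(cart_comm, 0, 1, &nbr[3], &nbr[2]);   // top  / bottom
  */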

  grid = initializeGrid(my_tile);

  switch(particle_mode){
  case GEOMETRIC: 
    particles = initializeGeometric(n, L, rho, my_tile, k, m,
                                     &particles_count, &particles_size);
    break;
  case LINEAR:
    particles = initializeLinear(n, L, alpha, beta, my_tile, k, m, 
                                     &particles_count, &particles_size);
    break;
  case SINUSOIDAL:
    particles = initializeSinusoidal(n, L, my_tile, k, m, 
                                     &particles_count, &particles_size);
    break;
  case PATCH:
    particles = initializePatch(n, L, init_patch, my_tile, k, m,
                                     &particles_count, &particles_size);
  }

  if (!particles) {
    printf("ERROR: Rank %d could not allocate space for %llu particles\n", my_ID, particles_size);
    error=1;
  }
  bail_out(error);

#if VERBOSE
  for (i=0; i<Num_procs; i++) {
    MPI_Barrier(MPI_COMM_WORLD);
    if (i == my_ID)  printf("Rank %d has %llu particles\n", my_ID, particles_count);
  }
#endif
  MPI_Reduce(&particles_count, &total_particles, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);
  if (my_ID==root) {
    printf("Number of particles placed         = %llu\n", total_particles);
  }

  /* Allocate space for communication buffers. Adjust appropriately as the simulation proceeds */
  
  uint64_t sendbuf_size[8], recvbuf_size[8];
  particle_t *sendbuf[8], *recvbuf[8];
  error=0;
  for (i=0; i<8; i++) {
    sendbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs));
    recvbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs));
    sendbuf[i] = (particle_t*) prk_malloc(sendbuf_size[i] * sizeof(particle_t));
    recvbuf[i] = (particle_t*) prk_malloc(recvbuf_size[i] * sizeof(particle_t));
    if (!sendbuf[i] || !recvbuf[i]) error++;
  }
  if (error) printf("Rank %d could not allocate communication buffers\n", my_ID);
  bail_out(error);
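
  /* add_particle_to_buffer and resize_buffer are PRK helpers; the policy they
     need is ordinary geometric growth.  A minimal sketch of resize_buffer,
     matching the call below (error handling omitted):

     void resize_buffer(particle_t **buf, uint64_t *size, uint64_t needed) {
       if (needed <= *size) return;          // already large enough
       while (*size < needed) *size *= 2;    // grow geometrically
       *buf = (particle_t*) realloc(*buf, (*size)*sizeof(particle_t));
     }
  */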
    
  /* Run the simulation */
  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_pic_time = wtime();
    }

    ptr_my = 0;
    for (i=0; i<8; i++) to_send[i]=0;
      
    /* Process own particles */
    p = particles;

    for (i=0; i < particles_count; i++) {
      fx = 0.0;
      fy = 0.0;
      computeTotalForce(p[i], my_tile, grid, &fx, &fy);

      ax = fx * MASS_INV;
      ay = fy * MASS_INV;

      /* Update particle positions, taking into account periodic boundaries */
      p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L);
      p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L);

      /* Update velocities */
      p[i].v_x += ax * DT;
      p[i].v_y += ay * DT;

      /* Check if particle stayed in same subdomain or moved to another */
      owner = find_owner(p[i], width, height, Num_procsx, icrit, jcrit, ileftover, jleftover);
      if (owner==my_ID) {
        add_particle_to_buffer(p[i], &p, &ptr_my, &particles_size);
      } else {
        /* Add particle to the communication buffer of the owning neighbor */
        int nb;
        for (nb=0; nb<8; nb++) {
          if (owner == nbr[nb]) {
            add_particle_to_buffer(p[i], &sendbuf[nb], &to_send[nb], &sendbuf_size[nb]);
            break;
          }
        }
        if (nb == 8) {
          printf("Could not find neighbor rank %llu owning particle %llu\n", owner, i);
          MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
      }
    }

    /* Communicate the number of particles to be sent/received */
    for (i=0; i<8; i++) {
      MPI_Isend(&to_send[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[i]);
      MPI_Irecv(&to_recv[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]);
    }
    MPI_Waitall(16, requests, status);
      
    /* Resize receive buffers if need be */
    for (i=0; i<8; i++) {
      resize_buffer(&recvbuf[i], &recvbuf_size[i], to_recv[i]);
    }
      
    /* Communicate the particles */
    for (i=0; i<8; i++) {
      MPI_Isend(sendbuf[i], to_send[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[i]);
      MPI_Irecv(recvbuf[i], to_recv[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]);
    }
    MPI_Waitall(16, requests, status);
     
    /* Attach received particles to particles buffer */
    for (i=0; i<4; i++) {
      attach_received_particles(&particles, &ptr_my, &particles_size, recvbuf[2*i], to_recv[2*i], 
                                recvbuf[2*i+1], to_recv[2*i+1]);
    }    
    particles_count = ptr_my;
  }
   
  local_pic_time = wtime() - local_pic_time;  /* same clock as the start stamp above */
  MPI_Reduce(&local_pic_time, &pic_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);
   
  /* Run the verification test */
  /* First verify own particles */
  for (i=0; i < particles_count; i++) {
    correctness += verifyParticle(particles[i], (double)L, iterations);
    my_checksum += (uint64_t)particles[i].ID;
  }

  /* Gather total checksum of particles */
  MPI_Reduce(&my_checksum, &tot_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);
  /* Gather total checksum of correctness flags */
  MPI_Reduce(&correctness, &correctness_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);

  if ( my_ID == root) {
    if (correctness_checksum != total_particles ) {
      printf("ERROR: there are %llu miscalculated locations\n", total_particles-correctness_checksum);
    }
    else {
      if (tot_checksum != (total_particles*(total_particles+1))/2) {
        printf("ERROR: Particle checksum incorrect\n");
      }
      else {
        avg_time = total_particles*iterations/pic_time;  /* particle moves per second */
        printf("Solution validates\n");
        printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time);
      }
    }
  }
   
#if VERBOSE
  for (i=0; i<Num_procs; i++) {
    MPI_Barrier(MPI_COMM_WORLD);
    if (i == my_ID)  printf("Rank %d has %llu particles\n", my_ID, particles_count);
  }
#endif

  MPI_Finalize();
   
  return 0;
}
Example #28
/*
 * This test looks at the handling of char and of types that are not
 * required to be integer types (e.g., long long).  MPICH allows these
 * as well.  A strict MPI test should not include this test.
 *
 * The rule on max loc is that if there is a tie in the value, the minimum
 * rank is used (see 4.9.3 in the MPI-1 standard)
 */
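/*
 * Concretely: every rank contributes val = 0 at index 1 below, so MAXLOC
 * must return val = 0 with loc = 0 (the lowest tying rank); the coutbuf[1]
 * checks in each block verify exactly this.
 */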
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rank, size;
    MPI_Comm      comm;

    MTest_Init( &argc, &argv );

    comm = MPI_COMM_WORLD;

    MPI_Comm_rank( comm, &rank );
    MPI_Comm_size( comm, &size );

    /* 2 int */
    {
	struct twoint { int val; int loc; } cinbuf[3], coutbuf[3];
 	
	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	MPI_Reduce( cinbuf, coutbuf, 3, MPI_2INT, MPI_MAXLOC, 0, comm );
	if (rank == 0) {
	    if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		errs++;
		fprintf( stderr, "2int MAXLOC(1) test failed\n" );
	    }
	    if (coutbuf[1].val != 0) {
		errs++;
		fprintf( stderr, "2int MAXLOC(0) test failed, value = %d, should be zero\n", coutbuf[1].val );
	    }
	    if (coutbuf[1].loc != 0) {
		errs++;
		fprintf( stderr, "2int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
	    }
	    if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) {
		errs++;
		fprintf( stderr, "2int MAXLOC(>) test failed\n" );
	    }
	}
    }

    /* float int */
    {
	struct floatint { float val; int loc; } cinbuf[3], coutbuf[3];
 	
	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = (float)rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	MPI_Reduce( cinbuf, coutbuf, 3, MPI_FLOAT_INT, MPI_MAXLOC, 0, comm );
	if (rank == 0) {
	    if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		errs++;
		fprintf( stderr, "float-int MAXLOC(1) test failed\n" );
	    }
	    if (coutbuf[1].val != 0) {
		errs++;
		fprintf( stderr, "float-int MAXLOC(0) test failed, value = %f, should be zero\n", coutbuf[1].val );
	    }
	    if (coutbuf[1].loc != 0) {
		errs++;
		fprintf( stderr, "float-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
	    }
	    if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) {
		errs++;
		fprintf( stderr, "float-int MAXLOC(>) test failed\n" );
	    }
	}
    }
    
    /* long int */
    {
	struct longint { long val; int loc; } cinbuf[3], coutbuf[3];
 	
	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	MPI_Reduce( cinbuf, coutbuf, 3, MPI_LONG_INT, MPI_MAXLOC, 0, comm );
	if (rank == 0) {
	    if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		errs++;
		fprintf( stderr, "long-int MAXLOC(1) test failed\n" );
	    }
	    if (coutbuf[1].val != 0) {
		errs++;
		fprintf( stderr, "long-int MAXLOC(0) test failed, value = %ld, should be zero\n", coutbuf[1].val );
	    }
	    if (coutbuf[1].loc != 0) {
		errs++;
		fprintf( stderr, "long-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
	    }
	    if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) {
		errs++;
		fprintf( stderr, "long-int MAXLOC(>) test failed\n" );
	    }
	}
    }

    /* short int */
    {
	struct shortint { short val; int loc; } cinbuf[3], coutbuf[3];
 	
	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	MPI_Reduce( cinbuf, coutbuf, 3, MPI_SHORT_INT, MPI_MAXLOC, 0, comm );
	if (rank == 0) {
	    if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		errs++;
		fprintf( stderr, "short-int MAXLOC(1) test failed\n" );
	    }
	    if (coutbuf[1].val != 0) {
		errs++;
		fprintf( stderr, "short-int MAXLOC(0) test failed, value = %d, should be zero\n", coutbuf[1].val );
	    }
	    if (coutbuf[1].loc != 0) {
		errs++;
		fprintf( stderr, "short-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
	    }
	    if (coutbuf[2].val != size-1) {
		errs++;
		fprintf( stderr, "short-int MAXLOC(>) test failed, value = %d, should be %d\n", coutbuf[2].val, size-1 );
	    }
	    if (coutbuf[2].loc != size -1) {
		errs++;
		fprintf( stderr, "short-int MAXLOC(>) test failed, location of max = %d, should be %d\n", coutbuf[2].loc, size-1 );
	    }
	}
    }
    
    /* double int */
    {
	struct doubleint { double val; int loc; } cinbuf[3], coutbuf[3];
 	
	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	MPI_Reduce( cinbuf, coutbuf, 3, MPI_DOUBLE_INT, MPI_MAXLOC, 0, comm );
	if (rank == 0) {
	    if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		errs++;
		fprintf( stderr, "double-int MAXLOC(1) test failed\n" );
	    }
	    if (coutbuf[1].val != 0) {
		errs++;
		fprintf( stderr, "double-int MAXLOC(0) test failed, value = %lf, should be zero\n", coutbuf[1].val );
	    }
	    if (coutbuf[1].loc != 0) {
		errs++;
		fprintf( stderr, "double-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
	    }
	    if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) {
		errs++;
		fprintf( stderr, "double-int MAXLOC(>) test failed\n" );
	    }
	}
    }
    
#ifdef HAVE_LONG_DOUBLE
    /* long double int */
    {
	struct longdoubleint { long double val; int loc; } cinbuf[3], coutbuf[3];

        /* avoid valgrind warnings about padding bytes in the long double */
        memset(&cinbuf[0], 0, sizeof(cinbuf));
        memset(&coutbuf[0], 0, sizeof(coutbuf));

	cinbuf[0].val = 1;
	cinbuf[0].loc = rank;
	cinbuf[1].val = 0;
	cinbuf[1].loc = rank;
	cinbuf[2].val = rank;
	cinbuf[2].loc = rank;
	
	coutbuf[0].val = 0;
	coutbuf[0].loc = -1;
	coutbuf[1].val = 1;
	coutbuf[1].loc = -1;
	coutbuf[2].val = 1;
	coutbuf[2].loc = -1;
	if (MPI_LONG_DOUBLE != MPI_DATATYPE_NULL) {
	    MPI_Reduce( cinbuf, coutbuf, 3, MPI_LONG_DOUBLE_INT, MPI_MAXLOC, 
			0, comm );
	    if (rank == 0) {
		if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) {
		    errs++;
		    fprintf( stderr, "long double-int MAXLOC(1) test failed\n" );
		}
		if (coutbuf[1].val != 0) {
		    errs++;
		    fprintf( stderr, "long double-int MAXLOC(0) test failed, value = %lf, should be zero\n", (double)coutbuf[1].val );
		}
		if (coutbuf[1].loc != 0) {
		    errs++;
		    fprintf( stderr, "long double-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc );
		}
		if (coutbuf[2].val != size-1) {
		    errs++;
		    fprintf( stderr, "long double-int MAXLOC(>) test failed, value = %lf, should be %d\n", (double)coutbuf[2].val, size-1 );
		}
		if (coutbuf[2].loc != size-1) {
		    errs++;
		    fprintf( stderr, "long double-int MAXLOC(>) test failed, location of max = %d, should be %d\n", coutbuf[2].loc, size-1 );
		}
	    }
	}
    }
#endif

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
Example #29
/*------------------------------------------------*/
int main (int argc, char **argv)
{
  int cols, rows, iter, particles, x, y;
  int *pic;
  PartStr *p, *changes, *totalChanges;
  int rank, num, i, numChanges, numTotalChanges;
  int *changesPerNode, *buffDispl;
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &num);
  
  if (argc < 5)			// use default values if the user does not supply all four arguments
    {
      cols = PIC_SIZE + 2;
      rows = PIC_SIZE + 2;
      iter = MAX_ITER;
      particles = PARTICLES;
    }
  else
    {
      cols = atoi (argv[1]) + 2;
      rows = atoi (argv[2]) + 2;
      particles = atoi (argv[3]);
      iter = atoi (argv[4]);
    }

  // initialize the random number generator
  srand(rank);
  // srand(time(0)); // this should be used instead if the program runs on multiple hosts

    
  int particlesPerNode = particles / num;
  if (rank == num - 1)
    particlesPerNode = particles - particlesPerNode * (num - 1);	// in case particles cannot be split evenly
// printf("%i has %i\n", rank, particlesPerNode);
  pic = (int *) malloc (sizeof (int) * cols * rows);
  p = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode);
  changes = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode);
  totalChanges = (PartStr *) malloc (sizeof (PartStr) * particles);	// gathered changes can come from every rank
  changesPerNode = (int *) malloc (sizeof (int) * num);
  buffDispl = (int *) malloc (sizeof (int) * num);
  assert (pic != 0 && p != 0 && changes != 0 && totalChanges != 0
	  && changesPerNode != 0 && buffDispl != 0);



  // MPI user type declaration
  int lengths[2] = { 1, 1 };
  MPI_Datatype types[2] = { MPI_INT, MPI_INT };
  MPI_Aint add1, add2;
  MPI_Aint displ[2];
  MPI_Datatype Point;

  MPI_Address (p, &add1);
  MPI_Address (&(p[0].y), &add2);
  displ[0] = 0;
  displ[1] = add2 - add1;

  MPI_Type_struct (2, lengths, displ, types, &Point);
  MPI_Type_commit (&Point);
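
  /* If PartStr really is just two adjacent ints with no padding (which the
     displacement computed above would confirm at run time, i.e.
     displ[1] == sizeof(int)), the struct construction collapses to:

     MPI_Datatype Point2;
     MPI_Type_contiguous (2, MPI_INT, &Point2);
     MPI_Type_commit (&Point2);
  */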


  dla_init_plist (pic, rows, cols, p, particlesPerNode, 1);
  while (--iter)
    {
      dla_evolve_plist (pic, rows, cols, p, &particlesPerNode, changes, &numChanges);      
//       printf("%i changed %i on iter %i : ",rank, numChanges, iter);
//       for(i=0;i<numChanges;i++) printf("(%i, %i) ", changes[i].x, changes[i].y);
//       printf("\n");
      
      //exchange information with other nodes
      MPI_Allgather (&numChanges, 1, MPI_INT, changesPerNode, 1, MPI_INT, MPI_COMM_WORLD);
      //calculate offsets
      numTotalChanges = 0;
      for (i = 0; i < num; i++)
	{
	  buffDispl[i] = numTotalChanges;
	  numTotalChanges += changesPerNode[i];
	}
//        if(rank==0)
//        {
//  	for(i=0;i<num;i++)
//  	  printf("%i tries to send %i\n",i,changesPerNode[i]);
//  	printf("-----------\n");
//        }
      if(numTotalChanges >0)
      {
      MPI_Allgatherv (changes, numChanges, Point,
		      totalChanges, changesPerNode, buffDispl, Point,
		      MPI_COMM_WORLD);
      apply_changes (pic, rows, cols, totalChanges, numTotalChanges);

	
//       if(rank==0)
//       {
//         printf("Total changes %i : ", numTotalChanges);
//         for(i=0;i<numTotalChanges;i++) printf("(%i, %i) ", totalChanges[i].x, totalChanges[i].y);
// 	
//         printf("\n");
// 	printf("-----------\n");
//       }
      }
    }

  /* Print to stdout a PBM picture of the simulation space */
  if (rank == 0)
    {
      printf ("P1\n%i %i\n", cols - 2, rows - 2);

      for (y = 1; y < rows - 1; y++)
	{
	  for (x = 1; x < cols - 1; x++)
	    {
	      if (pic[y * cols + x] < 0)
		printf ("1 ");
	      else
		printf ("0 ");
	    }
	  printf ("\n");
	}
    }
    
  MPI_Reduce(&particlesPerNode, &particles, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);  
  if(rank==0) fprintf(stderr, "Remaining particles %i\n", particles);
    
  free (pic);
  free (p);
  free (changes);
  free (totalChanges);
  free (changesPerNode);
  free (buffDispl);
  MPI_Finalize ();
  return 0;
}
Example #30
void
PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid)
{
    double  *utime = stat->utime;
    flops_t *ops = stat->ops;
    int_t   iam = grid->iam;
    flops_t flopcnt, factflop, solveflop;

    if ( options->PrintStat == NO ) return;

    if ( !iam && options->Fact != FACTORED ) {
	printf("**************************************************\n");
	printf("**** Time (seconds) ****\n");

        if ( options->Equil != NO )
	    printf("\tEQUIL time         %8.2f\n", utime[EQUIL]);
	if ( options->RowPerm != NOROWPERM )
	    printf("\tROWPERM time       %8.2f\n", utime[ROWPERM]);
	if ( options->ColPerm != NATURAL )
	    printf("\tCOLPERM time       %8.2f\n", utime[COLPERM]);
        printf("\tSYMBFACT time      %8.2f\n", utime[SYMBFAC]);
	printf("\tDISTRIBUTE time    %8.2f\n", utime[DIST]);

    }

    MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM,
	       0, grid->comm);
    factflop = flopcnt;
    if ( !iam && options->Fact != FACTORED ) {
	printf("\tFACTOR time        %8.2f\n", utime[FACT]);
	if ( utime[FACT] != 0.0 )
	    printf("\tFactor flops\t%e\tMflops \t%8.2f\n",
		   flopcnt,
		   flopcnt*1e-6/utime[FACT]);
    }
	
    MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 
	       0, grid->comm);
    solveflop = flopcnt;
    if ( !iam ) {
	printf("\tSOLVE time         %8.2f\n", utime[SOLVE]);
	if ( utime[SOLVE] != 0.0 )
	    printf("\tSolve flops\t%e\tMflops \t%8.2f\n",
		   flopcnt,
		   flopcnt*1e-6/utime[SOLVE]);
	if ( options->IterRefine != NOREFINE ) {
	    printf("\tREFINEMENT time    %8.2f\tSteps%8d\n\n",
		   utime[REFINE], stat->RefineSteps);
	}
	printf("**************************************************\n");
    }

#if ( PROFlevel>=1 )
    fflush(stdout);
    MPI_Barrier( grid->comm );

    {
	int_t i, P = grid->nprow*grid->npcol;
	flops_t b, maxflop;
	if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n");
	for (i = 0; i < P; ++i) {
	    if ( iam == i) {
		printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]);
		fflush(stdout);
	    }
	    MPI_Barrier( grid->comm );
	}
	if ( !iam ) printf("\n.. FACT ops distribution:\n");
	for (i = 0; i < P; ++i) {
	    if ( iam == i ) {
		printf("\t\t(%d)\t%e\n", iam, ops[FACT]);
		fflush(stdout);
	    }
	    MPI_Barrier( grid->comm );
	}
	MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
	if ( !iam ) {
	    b = factflop/P/maxflop;
	    printf("\tFACT load balance: %.2f\n", b);
	}
	if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
	for (i = 0; i < P; ++i) {
	    if ( iam == i ) {
		printf("\t\t%d\t%e\n", iam, ops[SOLVE]);
		fflush(stdout);
	    }
	    MPI_Barrier( grid->comm );
	}
	MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm);
	if ( !iam ) {
	    b = solveflop/P/maxflop;
	    printf("\tSOLVE load balance: %.2f\n", b);
	}
    }
#endif

/*  if ( !iam ) fflush(stdout);  CRASH THE SYSTEM pierre.  */
}