/*
 * This test attempts collective communication after a process in
 * the communicator has failed.
 */
int main(int argc, char **argv)
{
    int rank, size, i, rc, errclass, toterrs, errs = 0;
    char rbuf[100000];
    char *sendbuf;
    int deadprocs[1] = {1};
    MPI_Group world, newgroup;
    MPI_Comm newcomm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    if (size < 3) {
        fprintf( stderr, "Must run with at least 3 processes\n" );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    MPI_Comm_group(MPI_COMM_WORLD, &world);
    MPI_Group_excl(world, 1, deadprocs, &newgroup);
    MPI_Comm_create_group(MPI_COMM_WORLD, newgroup, 0, &newcomm);

    if (rank == 1) {
        exit(EXIT_FAILURE);
    }

    /* try a small send first */
    sendbuf = (char *)malloc(10*size*sizeof(char));

    if (rank == 0) {
        for (i=0;i<size;i++) {
            strcpy(sendbuf + i*10, "No Errors");
        }
    }

    rc = MPI_Scatter(sendbuf, 10, MPI_CHAR, rbuf, 10, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    /* reset the buffers and try a larger scatter */
    free(sendbuf);
    memset(rbuf, 0, sizeof(rbuf));
    sendbuf = (char *)malloc(100000*size*sizeof(char));

    if (rank == 0) {
        for (i=0;i<size;i++) {
            strcpy(sendbuf + i*100000, "No Errors");
        }
    }

    rc = MPI_Scatter(sendbuf, 100000, MPI_CHAR, rbuf, 100000, MPI_CHAR, 0, MPI_COMM_WORLD);

#if defined (MPICH) && (MPICH_NUMVERSION >= 30100102)
    MPI_Error_class(rc, &errclass);
    if ((rc) && (errclass != MPIX_ERR_PROC_FAILED)) {
        fprintf(stderr, "Wrong error code (%d) returned. Expected MPIX_ERR_PROC_FAILED\n", errclass);
        errs++;
    }
#endif

    rc = MPI_Reduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, 0, newcomm);
    if(rc)
        fprintf(stderr, "Failed to get errors from other processes\n");

    if (rank == 0) {
        if (toterrs) {
            printf( " Found %d errors\n", toterrs );
        }
        else {
            printf( " No Errors\n" );
        }
        fflush(stdout);
    }

    free(sendbuf);

    MPI_Comm_free(&newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&world);

    MPI_Finalize();

    return 0;
}
int main(int argc, char **argv) { int rank; int world_size; /* * Initialization */ int thread_support; if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) { fprintf(stderr,"MPI_Init_thread failed\n"); exit(1); } if (thread_support == MPI_THREAD_FUNNELED) fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n"); if (thread_support < MPI_THREAD_FUNNELED) fprintf(stderr,"Warning: MPI does not have thread support!\n"); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); starpu_srand48((long int)time(NULL)); parse_args(rank, argc, argv); int ret = starpu_init(NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* We disable sequential consistency in this example */ starpu_data_set_default_sequential_consistency_flag(0); starpu_mpi_init(NULL, NULL, 0); STARPU_ASSERT(p*q == world_size); starpu_cublas_init(); int barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); /* * Problem Init */ init_matrix(rank); fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank, (int)(allocated_memory/(1024*1024)), (int)(allocated_memory_extra/(1024*1024)), (int)((allocated_memory+allocated_memory_extra)/(1024*1024))); display_grid(rank, nblocks); TYPE *a_r = NULL; // STARPU_PLU(display_data_content)(a_r, size); TYPE *x, *y; if (check) { x = calloc(size, sizeof(TYPE)); STARPU_ASSERT(x); y = calloc(size, sizeof(TYPE)); STARPU_ASSERT(y); if (rank == 0) { unsigned ind; for (ind = 0; ind < size; ind++) x[ind] = (TYPE)starpu_drand48(); } a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks); if (rank == 0) STARPU_PLU(display_data_content)(a_r, size); // STARPU_PLU(compute_ax)(size, x, y, nblocks, rank); } barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size); /* * Report performance */ int reduce_ret; double min_timing = timing; double max_timing = timing; double sum_timing = timing; reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); if (rank == 0) { fprintf(stderr, "Computation took: %f ms\n", max_timing/1000); fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000); fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000); fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000)); unsigned n = size; double flop = (2.0f*n*n*n)/3.0f; fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f)); } /* * Test Result Correctness */ if (check) { /* * Compute || A - LU || */ STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r); #if 0 /* * Compute || Ax - LUx || */ unsigned ind; y2 = calloc(size, sizeof(TYPE)); STARPU_ASSERT(y); if (rank == 0) { for (ind = 0; ind < size; ind++) { y2[ind] = (TYPE)0.0; } } STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank); /* Compute y2 = y2 - y */ CPU_AXPY(size, -1.0, y, 1, y2, 1); TYPE err = CPU_ASUM(size, y2, 1); int max = CPU_IAMAX(size, y2, 1); fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size)); fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]); #endif } /* * Termination */ barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); 
    starpu_cublas_shutdown();
    starpu_mpi_shutdown();
    starpu_shutdown();

#if 0
    MPI_Finalize();
#endif

    return 0;
}
int main(int argc, char** argv)
{
    int rank;
    int size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if(argc != 5) {
        fprintf(stderr, "argc %d\n", argc);
        fprintf(stderr, "Usage: %s <left> <right> <numSyncFiles> <tolerance>\n"
                "where <left> and <right> are different "
                "N-procs_case directories\n", argv[0]);
        MPI_Finalize();
        return 1;
    }
    char* lpath = argv[1];
    char* rpath = argv[2];
    int nSyncFiles = atoi(argv[3]);
    double tolerance = atof(argv[4]);
    int ndof;
    int nshg;
    int solsize;
    double* solution;
    std::set<int>* l_timesteps = find_timesteps(lpath, nSyncFiles);
    std::set<int>* r_timesteps = find_timesteps(rpath, nSyncFiles);
    std::set<int>* timesteps_to_check = new std::set<int>;
    std::set_intersection(l_timesteps->begin(), l_timesteps->end(),
                          r_timesteps->begin(), r_timesteps->end(),
                          std::inserter(*timesteps_to_check, timesteps_to_check->begin()));
    delete l_timesteps;
    delete r_timesteps;
    if(rank == 0)
        printf("Found %d common timesteps\n", (int)timesteps_to_check->size());
#ifdef DBGONLY
    read_solution(&solution, &solsize, &nshg, &ndof, size, rank, 0, nSyncFiles, "./");
    printf("nshg: %d, ndof: %d\n", nshg, ndof);
    assert(solsize == ndof*nshg);
#endif
    double maxerror = 0.0;
    double error;
    double gblmaxerror;
    for(std::set<int>::iterator i = timesteps_to_check->begin();
        i != timesteps_to_check->end(); i++) {
        error = compare_solution(lpath, rpath, *i, size, nSyncFiles);
        if(error > maxerror) maxerror = error;
    }
    delete timesteps_to_check;
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Reduce(&maxerror, &gblmaxerror, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if(rank == 0)
        printf("Maximum difference across all timesteps: %e\n", gblmaxerror);
    MPI_Finalize();
    return (gblmaxerror > tolerance);
}
void test_P3DFFT(int *n, std::ofstream& results, int decomp, int * dims){ int nx,ny,nz,procid,nprocs,ndim; int istart[3],isize[3],iend[3]; int fstart[3],fsize[3],fend[3]; int p3dfft_mem_conf,nrep; long int Nlocal,Nglob; double factor; double l_timers[12]={0},g_timers[12]={0}; double total_time=0*MPI_Wtime(), setup_time=0; // rtime_local is timings on each process and _global is the max reduced to root // 0 is the forward FFT time, 1 is the Hadamard multiplication, 2 is the IFFT time, 3 is the sum of 0-2, and 4 is the setup time // The communication time is measured by l_timers locally on each process and then reduced to g_timers to the root. // the sum of first four elements give the comm time unsigned char op_f[4]="fft", op_b[4]="tff"; int memsize[3]; MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&procid); nx=n[0]; ny=n[1]; nz=n[2]; ndim=1; nrep=NREP; if(decomp==1){ dims[0] = 1; dims[1] = nprocs; } if(procid == 0) printf("Using processor grid %d x %d\n",dims[0],dims[1]); /* Initialize P3DFFT */ MPI_Barrier(MPI_COMM_WORLD); setup_time -= MPI_Wtime(); //Compute Setup Time. Cp3dfft_setup(dims,nx,ny,nz,MPI_Comm_c2f(MPI_COMM_WORLD),nx,ny,nz,1,memsize); setup_time += MPI_Wtime(); //Compute Setup Time. PCOUT<<"done with setup"<<std::endl; Cp3dfft_get_dims(istart,iend,isize,1); Cp3dfft_get_dims(fstart,fend,fsize,2); /* Allocate and initialize */ double *A; // Input matrix A A=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); //B=(double*)fftw_malloc(sizeof(double)*(memsize[0]*memsize[1]*memsize[2]*2)); /* Warmup */ Cp3dfft_ftran_r2c(A,A,op_f); Cp3dfft_ftran_r2c(A,A,op_f); MPI_Barrier(MPI_COMM_WORLD); Cset_timers(); for (int rep=0; rep<nrep; rep++){ initialize_p3dfft(A,n); MPI_Barrier(MPI_COMM_WORLD); /* Forward transform */ total_time -= MPI_Wtime(); Cp3dfft_ftran_r2c(A,A,op_f); total_time += MPI_Wtime(); MPI_Barrier(MPI_COMM_WORLD); } Cget_timers(l_timers); Cp3dfft_btran_c2r(A,A,op_b); /* Compute Error */ //PCOUT<<"Done With FFTs computing error"<<std::endl; compute_error_p3dfft(A,n); /* Gather timing statistics */ double g_total_time, g_comm_time, g_setup_time; MPI_Reduce(&total_time,&g_total_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&l_timers,&g_timers,12,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); g_total_time=g_total_time/nrep; g_comm_time=(g_timers[0]+g_timers[1]+g_timers[2]+g_timers[3])/((double) nrep); //g_total_time=g_total_time/((double)nrep); ptrdiff_t size=n[0];size*=n[1]; size*=n[2]; double gflops=2.5*size*( log2(n[2]) + log2(n[0])+ log2(n[1]) )/(g_total_time)/1e9; if(procid == 0){ std::cout.precision(4); std::cout<<"P3DFFT Size="<<n[0]<<" "<<n[1]<<" "<<n[2]<<std::endl;; std::cout<<"0= "<<g_timers[0]<<" 1= "<<g_timers[1]<<" 2= "<<g_timers[2]<<" 3= "<<g_timers[3]<<" 4= "<<g_timers[4]<<std::endl; std::cout<<"5= "<<g_timers[5]<<" 6= "<<g_timers[6]<<" 7= "<<g_timers[7]<<" 8= "<<g_timers[8]<<" 9= "<<g_timers[9]<<std::endl; std::cout<<"10= "<<g_timers[10]<<" 11= "<<g_timers[11]<<std::endl; std::cout<<"\033[1;31m"; std::cout<<"\t"<<"np"<<"\t"<<"Grid"<<"\t"<<"Total"<<'\t'<<"Comm Time"<<"\t"<<"Setup Time"<<"\t"<<"\t"<<"Reps"<<'\t'<<"GFlops"<<std::endl; std::cout<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; std::cout<<"\033[0m\n"<<std::endl; 
results<<"\t"<<nprocs<<"\t"<<dims[1]<<"*"<<dims[0]<<"\t"<<g_total_time<<'\t'<<g_comm_time<<"\t"<<g_setup_time<<"\t"<<nrep<<'\t'<<gflops<<std::endl; } /* Free work space */ fftw_free(A); Cp3dfft_clean(); }
int olsr_main(int argc, char *argv[]) { int i; char log[32]; tw_opt_add(olsr_opts); tw_init(&argc, &argv); #if DEBUG sprintf( log, "olsr-log.%ld", g_tw_mynode ); olsr_event_log = fopen( log, "w+"); if( olsr_event_log == nullptr ) tw_error( TW_LOC, "Failed to Open OLSR Event Log file \n"); #endif g_tw_mapping = CUSTOM; g_tw_custom_initial_mapping = &olsr_custom_mapping; g_tw_custom_lp_global_to_local_map = &olsr_mapping_to_lp; // nlp_per_pe = OLSR_MAX_NEIGHBORS;// / tw_nnodes(); //g_tw_lookahead = SA_INTERVAL; SA_range_start = nlp_per_pe; // Increase nlp_per_pe by nlp_per_pe / OMN nlp_per_pe += nlp_per_pe / OLSR_MAX_NEIGHBORS; g_tw_events_per_pe = OLSR_MAX_NEIGHBORS / 2 * nlp_per_pe + 65536; tw_define_lps(nlp_per_pe, sizeof(olsr_msg_data)); for(i = 0; i < OLSR_END_EVENT; i++) g_olsr_event_stats[i] = 0; for (i = 0; i < SA_range_start; i++) { tw_lp_settype(i, &olsr_lps[0]); } for (i = SA_range_start; i < nlp_per_pe; i++) { tw_lp_settype(i, &olsr_lps[1]); } #if DEBUG printf("g_tw_nlp is %llu\n", g_tw_nlp); #endif tw_run(); if( g_tw_synchronization_protocol != 1 ) { MPI_Reduce( g_olsr_event_stats, g_olsr_root_event_stats, OLSR_END_EVENT, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD); } else { for (i = 0; i < OLSR_END_EVENT; i++) { g_olsr_root_event_stats[i] = g_olsr_event_stats[i]; } } if (tw_ismaster()) { for( i = 0; i < OLSR_END_EVENT; i++ ) printf("OLSR Type %s Event Count = %llu \n", event_names[i], g_olsr_root_event_stats[i]); printf("Complete.\n"); } tw_end(); return 0; }
int main( int argc, char *argv[] ) { int opt; extern char *optarg; extern int optind; int is_output_timing=0, is_print_usage = 0; int _debug=0, use_gen_file = 0, use_actsto = 0, use_normalsto=0; char *token; MPI_Offset disp, offset, file_size; MPI_Datatype etype, ftype, buftype; int errs = 0; int size, rank, i, count; char *fname = NULL; double *buf; MPI_File fh; MPI_Comm comm; MPI_Status status; int64_t nitem = 0; int fsize = 0, type_size; double stime, etime, iotime, comptime, elapsed_time; double max_iotime, max_comptime; double max, min, sum=0.0, global_sum; MPI_Init( &argc, &argv ); comm = MPI_COMM_WORLD; MPI_Comm_size( comm, &size ); MPI_Comm_rank( comm, &rank ); while ( (opt=getopt(argc,argv,"i:s:godhxt"))!= EOF) { switch (opt) { case 'i': fname = optarg; break; case 'o': is_output_timing = 1; break; case 'g': use_gen_file = 1; break; case 'd': _debug = 1; break; case 'h': is_print_usage = 1; break; case 's': token = strtok(optarg, ":"); //if (rank == 0) printf("token=%s\n", token); if(token == NULL) { if (rank == 0) printf("1: Wrong file size format!\n"); MPI_Finalize(); exit(1); } fsize = atoi(token); token = strtok(NULL, ":"); //if (rank == 0) printf("token=%s\n", token); if(token == NULL) { if (rank == 0) printf("2: Wrong file size format!\n"); MPI_Finalize(); exit(1); } if(*token != 'm' && *token != 'g') { if (rank == 0) printf("3: Wrong file size format!\n"); MPI_Finalize(); exit(1); } if (rank ==0) printf("fsize = %d (%s)\n", fsize, (*token=='m'?"MB":"GB")); if (fsize == 0) nitem = 0; else { MPI_Type_size(MPI_DOUBLE, &type_size); nitem = fsize*1024; /* KB */ nitem = nitem*1024; /* MB */ if(*token == 'g') { //if(rank == 0) printf("data in GB\n"); nitem = nitem*1024; /* GB */ } nitem = nitem/type_size; //printf("nitem=%lld\n", nitem); nitem = nitem/size; /* size means comm size */ } if (rank == 0) printf("nitem = %d\n", nitem); break; case 'x': use_actsto = 1; break; case 't': use_normalsto = 1; break; default: is_print_usage = 1; break; } } if (fname == NULL || is_print_usage == 1 || nitem == 0) { if (rank == 0) usage(argv[0]); MPI_Finalize(); exit(1); } int sizeof_mpi_offset; sizeof_mpi_offset = (int)(sizeof(MPI_Offset)); // 8 //if (rank == 0) printf ("size_of_mpi_offset=%d\n", sizeof_mpi_offset); if(use_normalsto == 1 && use_actsto == 1) { if(rank == 0) printf("Can't test both: either normalsto or actsto\n"); MPI_Finalize(); exit(1); } #if 0 if(use_actsto == 1) { if (size != 1) { if(rank == 0) printf("active storage should be run with only 1 process!!!\n"); MPI_Finalize(); exit(1); } } #endif /* initialize random seed: */ srand(time(NULL)); if(use_gen_file == 1) { int t, result; MPI_File_open( comm, fname, MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh ); /* Set the file view */ disp = rank * nitem * type_size; printf("%d: disp = %lld\n", rank, disp); etype = MPI_DOUBLE; ftype = MPI_DOUBLE; result = MPI_File_set_view(fh, disp, etype, ftype, "native", MPI_INFO_NULL); if(result != MPI_SUCCESS) sample_error(result, "MPI_File_set_view"); buf = (double *)malloc( nitem * sizeof(double) ); if (buf == NULL) { if(rank == 0) printf("malloc() failed\n"); MPI_Finalize(); exit(1); } buf[0] = rand()%4096; if(rank==0) printf("%lf\n", buf[0]); max = min = sum = buf[0]; for(i=1; i<nitem; i++) { t = rand()%4096; if (t>max) max = t; if (t<min) min = t; sum += t; buf[i] = t; if (i<10 && rank == 0) printf("%lf\n", buf[i]); } if(rank == 0) { printf("MPI_Type_size(MPI_DOUBLE)=%d\n", type_size); printf ("max=%lf, min=%lf, sum=%lf\n", max, min, sum); } stime = MPI_Wtime(); /* Write to file */ 
MPI_File_write_all( fh, buf, nitem, MPI_DOUBLE, &status ); etime = MPI_Wtime(); iotime = etime - stime; printf("%d: iotime (write) = %10.4f\n", rank, iotime); MPI_Get_count( &status, MPI_DOUBLE, &count ); //printf("count = %lld\n", count); if (count != nitem) { fprintf( stderr, "%d: Wrong count (%lld) on write\n", rank, count ); fflush(stderr); /* exit */ MPI_Finalize(); exit(1); } MPI_File_close(&fh); MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) printf("File is written\n\n"); } double *tmp = (double *)malloc( nitem * sizeof(double) ); memset (tmp, 0, nitem*sizeof(double)); if(use_normalsto == 1) { MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh ); /* Read nothing (check status) */ memset( &status, 0xff, sizeof(MPI_Status) ); offset = rank * nitem * type_size; /* start I/O */ stime = MPI_Wtime(); MPI_File_read_at(fh, offset, tmp, nitem, MPI_DOUBLE, &status); etime = MPI_Wtime(); /* end I/O */ iotime = etime - stime; if(_debug==1) printf("%d: iotime = %10.4f\n", rank, iotime); MPI_Reduce(&iotime, &max_iotime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); sum = 0.0; /* reset sum */ /* start computation */ stime = MPI_Wtime(); for(i=0; i<nitem; i++) { sum += tmp[i]; } MPI_Reduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); etime = MPI_Wtime(); /* end computation */ comptime = etime - stime; if(_debug==1) printf("%d: comptime = %10.4f\n", rank, comptime); MPI_Reduce(&comptime, &max_comptime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); if(rank == 0) { elapsed_time = max_comptime + max_iotime; printf("<<Result (SUM) with normal read>>\n" "SUM = %10.4f \n" "Computation time = %10.4f sec\n" "I/O time = %10.4f sec\n" "total time = %10.4f sec\n\n", global_sum, max_comptime, max_iotime, elapsed_time); } MPI_File_close(&fh); } #if 0 if(use_actsto == 1) { #if 0 /* MPI_MAX */ MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh ); stime = MPI_Wtime(); MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_MAX, &status ); etime = MPI_Wtime(); elapsed_time = etime-stime; printf ("<<Result with active storage>>\n" "max=%lf (in %10.4f sec)\n", tmp[0], elapsed_time); MPI_File_close(&fh); /* MPI_MIN */ MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh ); stime = MPI_Wtime(); MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_MIN, &status ); etime = MPI_Wtime(); elapsed_time = etime - stime; printf ("min=%lf (in %10.4f sec)\n", tmp[0], elapsed_time); MPI_File_close(&fh); #endif /* MPI_SUM */ MPI_File_open( comm, fname, MPI_MODE_RDWR, MPI_INFO_NULL, &fh ); memset(&status, 0xff, sizeof(MPI_Status)); offset = rank * nitem * type_size; stime = MPI_Wtime(); MPI_File_read_at_ex( fh, offset, tmp, nitem, MPI_DOUBLE, MPI_SUM, &status ); etime = MPI_Wtime(); elapsed_time = etime - stime; printf ("<<Result with active storage>>\n" "sum=%lf (in %10.4f sec)\n", tmp[0], elapsed_time); MPI_File_close( &fh ); } #endif MPI_Barrier(MPI_COMM_WORLD); if (use_gen_file == 1) free( buf ); free( tmp ); MPI_Finalize(); return errs; }
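/*
 * Illustrative sketch, not part of the benchmark above: the read phase uses
 * independent MPI_File_read_at calls. A collective variant,
 * MPI_File_read_at_all, lets the MPI-IO layer merge the per-rank requests.
 * The same rank-offset access pattern is shown in isolation below; the file
 * name "testfile" and the element count are placeholders.
 */
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, type_size;
    const int nitem = 1024;          /* placeholder element count */
    char fname[] = "testfile";       /* placeholder file name */
    MPI_File fh;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Type_size(MPI_DOUBLE, &type_size);

    double *tmp = new double[nitem];
    MPI_Offset offset = (MPI_Offset)rank * nitem * type_size;

    MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
    /* collective call: every rank of the communicator must participate */
    MPI_File_read_at_all(fh, offset, tmp, nitem, MPI_DOUBLE, &status);
    MPI_File_close(&fh);

    delete [] tmp;
    MPI_Finalize();
    return 0;
}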
void trainOneEpochDenseCPU(int itask, float *data, float *numerator, float *denominator, float *codebook, unsigned int nSomX, unsigned int nSomY, unsigned int nDimensions, unsigned int nVectors, unsigned int nVectorsPerRank, float radius, float scale, string mapType, string gridType, bool compact_support, bool gaussian, int *globalBmus) { unsigned int p1[2] = {0, 0}; unsigned int *bmus = new unsigned int[nVectorsPerRank * 2]; #ifdef _OPENMP #pragma omp parallel default(shared) private(p1) #endif { #ifdef _OPENMP #pragma omp for #endif #ifdef _WIN32 for (int n = 0; n < nVectorsPerRank; n++) { #else for (unsigned int n = 0; n < nVectorsPerRank; n++) { #endif if (itask * nVectorsPerRank + n < nVectors) { /// get the best matching unit get_bmu_coord(codebook, data, nSomY, nSomX, nDimensions, p1, n); bmus[2 * n] = p1[0]; bmus[2 * n + 1] = p1[1]; } } } float *localNumerator = new float[nSomY * nSomX * nDimensions]; float *localDenominator = new float[nSomY * nSomX]; #ifdef _OPENMP #pragma omp parallel default(shared) #endif { #ifdef _OPENMP #pragma omp for #endif #ifdef _WIN32 for (int som_y = 0; som_y < nSomY; som_y++) { #else for (unsigned int som_y = 0; som_y < nSomY; som_y++) { #endif for (unsigned int som_x = 0; som_x < nSomX; som_x++) { localDenominator[som_y * nSomX + som_x] = 0.0; for (unsigned int d = 0; d < nDimensions; d++) localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] = 0.0; } } /// Accumulate denoms and numers #ifdef _OPENMP #pragma omp for #endif #ifdef _WIN32 for (int som_y = 0; som_y < nSomY; som_y++) { #else for (unsigned int som_y = 0; som_y < nSomY; som_y++) { #endif for (unsigned int som_x = 0; som_x < nSomX; som_x++) { for (unsigned int n = 0; n < nVectorsPerRank; n++) { if (itask * nVectorsPerRank + n < nVectors) { float dist = 0.0f; if (gridType == "rectangular") { if (mapType == "planar") { dist = euclideanDistanceOnPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]); } else if (mapType == "toroid") { dist = euclideanDistanceOnToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY); } } else { if (mapType == "planar") { dist = euclideanDistanceOnHexagonalPlanarMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1]); } else if (mapType == "toroid") { dist = euclideanDistanceOnHexagonalToroidMap(som_x, som_y, bmus[2 * n], bmus[2 * n + 1], nSomX, nSomY); } } float neighbor_fuct = getWeight(dist, radius, scale, compact_support, gaussian); for (unsigned int d = 0; d < nDimensions; d++) { localNumerator[som_y * nSomX * nDimensions + som_x * nDimensions + d] += 1.0f * neighbor_fuct * (*(data + n * nDimensions + d)); } localDenominator[som_y * nSomX + som_x] += neighbor_fuct; } } } } } #ifdef HAVE_MPI MPI_Reduce(localNumerator, numerator, nSomY * nSomX * nDimensions, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(localDenominator, denominator, nSomY * nSomX, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Gather(bmus, nVectorsPerRank * 2, MPI_INT, globalBmus, nVectorsPerRank * 2, MPI_INT, 0, MPI_COMM_WORLD); #else for (unsigned int i = 0; i < nSomY * nSomX * nDimensions; ++i) { numerator[i] = localNumerator[i]; } for (unsigned int i = 0; i < nSomY * nSomX; ++i) { denominator[i] = localDenominator[i]; } for (unsigned int i = 0; i < 2 * nVectorsPerRank; ++i) { globalBmus[i] = bmus[i]; } #endif delete [] bmus; delete [] localNumerator; delete [] localDenominator; }
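/*
 * Illustrative sketch, not taken from the code above: when the root is
 * allowed to overwrite its own partial results, MPI_IN_PLACE removes the
 * need for separate local/global arrays such as localNumerator/numerator.
 * The function and buffer names below are hypothetical.
 */
#include <mpi.h>

void sum_to_root(float *partial, int count, int root, MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);
    if (rank == root)
        /* the root's contribution is read from, and the global sum written
           back into, the same "partial" array */
        MPI_Reduce(MPI_IN_PLACE, partial, count, MPI_FLOAT, MPI_SUM, root, comm);
    else
        /* recvbuf is only significant at the root, so NULL is fine here */
        MPI_Reduce(partial, NULL, count, MPI_FLOAT, MPI_SUM, root, comm);
}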
int main( int argc, char *argv[] ) { int num_errors = 0, total_num_errors = 0; int rank, size; char port1[MPI_MAX_PORT_NAME]; char port2[MPI_MAX_PORT_NAME]; char port3[MPI_MAX_PORT_NAME]; MPI_Status status; MPI_Comm comm1, comm2, comm3; int verbose = 0; int data = 0; if (getenv("MPITEST_VERBOSE")) { verbose = 1; } MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (size < 4) { printf("Four processes needed to run this test.\n"); MPI_Finalize(); return 0; } if (rank == 0) { IF_VERBOSE(("0: opening ports.\n")); MPI_Open_port(MPI_INFO_NULL, port1); MPI_Open_port(MPI_INFO_NULL, port2); MPI_Open_port(MPI_INFO_NULL, port3); IF_VERBOSE(("0: opened port1: <%s>\n", port1)); IF_VERBOSE(("0: opened port2: <%s>\n", port2)); IF_VERBOSE(("0: opened port3: <%s>\n", port3)); IF_VERBOSE(("0: sending ports.\n")); MPI_Send(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 1, 0, MPI_COMM_WORLD); MPI_Send(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 2, 0, MPI_COMM_WORLD); MPI_Send(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 3, 0, MPI_COMM_WORLD); IF_VERBOSE(("0: accepting port3.\n")); MPI_Comm_accept(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3); IF_VERBOSE(("0: accepting port2.\n")); MPI_Comm_accept(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2); IF_VERBOSE(("0: accepting port1.\n")); MPI_Comm_accept(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1); IF_VERBOSE(("0: closing ports.\n")); MPI_Close_port(port1); MPI_Close_port(port2); MPI_Close_port(port3); IF_VERBOSE(("0: sending 1 to process 1.\n")); data = 1; MPI_Send(&data, 1, MPI_INT, 0, 0, comm1); IF_VERBOSE(("0: sending 2 to process 2.\n")); data = 2; MPI_Send(&data, 1, MPI_INT, 0, 0, comm2); IF_VERBOSE(("0: sending 3 to process 3.\n")); data = 3; MPI_Send(&data, 1, MPI_INT, 0, 0, comm3); IF_VERBOSE(("0: disconnecting.\n")); MPI_Comm_disconnect(&comm1); MPI_Comm_disconnect(&comm2); MPI_Comm_disconnect(&comm3); } else if (rank == 1) { IF_VERBOSE(("1: receiving port.\n")); MPI_Recv(port1, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status); IF_VERBOSE(("1: received port1: <%s>\n", port1)); IF_VERBOSE(("1: connecting.\n")); MPI_Comm_connect(port1, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm1); MPI_Recv(&data, 1, MPI_INT, 0, 0, comm1, &status); if (data != 1) { printf("Received %d from root when expecting 1\n", data); fflush(stdout); num_errors++; } IF_VERBOSE(("1: disconnecting.\n")); MPI_Comm_disconnect(&comm1); } else if (rank == 2) { IF_VERBOSE(("2: receiving port.\n")); MPI_Recv(port2, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status); IF_VERBOSE(("2: received port2: <%s>\n", port2)); /* make sure process 1 has time to do the connect before this process attempts to connect */ MTestSleep(2); IF_VERBOSE(("2: connecting.\n")); MPI_Comm_connect(port2, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm2); MPI_Recv(&data, 1, MPI_INT, 0, 0, comm2, &status); if (data != 2) { printf("Received %d from root when expecting 2\n", data); fflush(stdout); num_errors++; } IF_VERBOSE(("2: disconnecting.\n")); MPI_Comm_disconnect(&comm2); } else if (rank == 3) { IF_VERBOSE(("3: receiving port.\n")); MPI_Recv(port3, MPI_MAX_PORT_NAME, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status); IF_VERBOSE(("2: received port2: <%s>\n", port2)); /* make sure process 1 and 2 have time to do the connect before this process attempts to connect */ MTestSleep(4); IF_VERBOSE(("3: connecting.\n")); MPI_Comm_connect(port3, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm3); MPI_Recv(&data, 1, MPI_INT, 0, 0, comm3, &status); if (data != 3) { printf("Received %d from root when expecting 
3\n", data); fflush(stdout); num_errors++; } IF_VERBOSE(("3: disconnecting.\n")); MPI_Comm_disconnect(&comm3); } MPI_Barrier(MPI_COMM_WORLD); MPI_Reduce(&num_errors, &total_num_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0) { if (total_num_errors) { printf(" Found %d errors\n", total_num_errors); } else { printf(" No Errors\n"); } fflush(stdout); } MPI_Finalize(); return total_num_errors; }
int main(int argc, char *argv[]) { int i, numprocs, rank, size, align_size, disp; int skip; double latency = 0.0, t_start = 0.0, t_stop = 0.0; double timer=0.0; double avg_time = 0.0, max_time = 0.0, min_time = 0.0; char *sendbuf, *recvbuf, *s_buf1, *r_buf1; int *rdispls=NULL, *recvcounts=NULL; int max_msg_size = 1048576, full = 0; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); if (process_args(argc, argv, rank, &max_msg_size, &full)) { MPI_Finalize(); return EXIT_SUCCESS; } if(numprocs < 2) { if(rank == 0) { fprintf(stderr, "This test requires at least two processes\n"); } MPI_Finalize(); return EXIT_FAILURE; } print_header(rank, full); recvcounts=rdispls=NULL; recvcounts = (int *) malloc (numprocs*sizeof(int)); if(NULL == recvcounts) { fprintf(stderr, "malloc failed.\n"); exit(1); } rdispls = (int *) malloc (numprocs*sizeof(int)); if(NULL == rdispls) { fprintf(stderr, "malloc failed.\n"); exit(1); } s_buf1 = r_buf1 = NULL; s_buf1 = (char *) malloc(sizeof(char)*max_msg_size + MAX_ALIGNMENT); if(NULL == s_buf1) { fprintf(stderr, "malloc failed.\n"); exit(1); } r_buf1 = (char *) malloc(sizeof(char)*max_msg_size * numprocs + MAX_ALIGNMENT); if(NULL == r_buf1) { fprintf(stderr, "malloc failed.\n"); exit(1); } align_size = getpagesize(); sendbuf = (char *)(((unsigned long) s_buf1 + (align_size - 1)) / align_size * align_size); recvbuf = (char *)(((unsigned long) r_buf1 + (align_size - 1)) / align_size * align_size); memset(sendbuf, 1, max_msg_size); memset(recvbuf, 0, max_msg_size * numprocs); for(size=1; size <= max_msg_size; size *= 2) { if(size > LARGE_MESSAGE_SIZE) { skip = SKIP_LARGE; iterations = iterations_large; } else { skip = SKIP; } MPI_Barrier(MPI_COMM_WORLD); disp =0; for ( i = 0; i < numprocs; i++) { recvcounts[i] = size; rdispls[i] = disp; disp += size; } MPI_Barrier(MPI_COMM_WORLD); timer=0.0; for(i=0; i < iterations + skip ; i++) { t_start = MPI_Wtime(); MPI_Allgatherv(sendbuf, size, MPI_CHAR, recvbuf, recvcounts, rdispls, MPI_CHAR, MPI_COMM_WORLD); t_stop = MPI_Wtime(); if(i >= skip) { timer+= t_stop-t_start; } MPI_Barrier(MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); latency = (double)(timer * 1e6) / iterations; MPI_Reduce(&latency, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); MPI_Reduce(&latency, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&latency, &avg_time, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); avg_time = avg_time/numprocs; print_data(rank, full, size, avg_time, min_time, max_time, iterations); MPI_Barrier(MPI_COMM_WORLD); } free(s_buf1); free(r_buf1); free(recvcounts); free(rdispls); MPI_Finalize(); return EXIT_SUCCESS; }
int main(int argc, char **argv)
{
    int size, rank, number;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double start = MPI_Wtime();   // start the timer
    double umaxglob = 0;          // maximum error over all processes

    if (argc < 2) {
        printf("Usage:\n");
        printf("  poisson n\n\n");
        printf("Arguments:\n");
        printf("  n: the problem size (must be a power of 2)\n");
        MPI_Finalize();
        return 1;
    }

    // The number of grid points in each direction is n+1
    // The number of degrees of freedom in each direction is n-1
    int n = atoi(argv[1]);
    int m = n - 1;                                         // number of points in each direction of B

    int *cnt = (int *) malloc(size * sizeof(int));         // local number of matrix columns per process
    int *displs = (int *) malloc((size+1) * sizeof(int));  // displacement of each process's points in sendbuf
    displs[size] = m;
    displs[0] = 0;                                         // the displacement of the first process is always 0
    int overflow = m % size;                               // number of leftover columns

    for (int i = 0; i < size; i++) {
        cnt[i] = m / size;        // nrColon for each process
        if (overflow != 0) {
            cnt[i]++;             // distribute the extra columns
            overflow--;
        }
        if (i < size-1) {
            displs[i+1] = displs[i] + cnt[i];
        }
    }

    int nrColon = cnt[rank];      // number of columns owned by this rank
    int pros_dof = nrColon * m;   // number of elements owned by this rank
    int nn = 4 * n;
    double h = 1.0 / n;

    // Grid points
    double *grid = mk_1D_array(n+1, false);
    double **b  = mk_2D_array(nrColon, m, false);
    double **bt = mk_2D_array(nrColon, m, false);
    int trad = omp_get_max_threads();             // number of OpenMP threads
    double **z = mk_2D_array(trad, nn, false);    // z is 2D because of the OpenMP parallelization,
                                                  // so the FST does not overwrite other threads' z
    double *diag = mk_1D_array(m, false);
    double *sendbuf = mk_1D_array(nrColon*m, false);
    double *recbuf  = mk_1D_array(nrColon*m, false);

    int *sendcnt = (int *) malloc((size+1) * sizeof(int));  // number of elements to send to each process
    int *sdispls = (int *) malloc((size+1) * sizeof(int));  // index in sendbuf for each process
    sdispls[0] = 0;                                         // process 0 always starts at index 0
    for (int i = 0; i < size; i++) {
        sendcnt[i] = cnt[i]*cnt[rank];    // number of elements I own * number of elements it owns
        sdispls[i] = displs[i]*cnt[rank]; // displacement for each process
    }

    // GRID
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n+1; i++) {
        grid[i] = i * h;
    }

    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        diag[i] = 2.0 * (1.0 - cos((i+1) * PI / n));  // eigenvalue
    }

    // Initialize the right hand side data
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            // b[i][j] = h * h;
            b[i][j] = h * h * func1(grid[i+displs[rank]], grid[j]);  // evaluate the function, scaled by h*h
        }
    }

    // Calculate Btilde^T = S^-1 * (S * B)^T
    #pragma omp parallel for schedule(guided, 5)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(b[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose(b, bt, nrColon, m, sendbuf, recbuf, sendcnt, sdispls, size, rank, displs);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Solve Lambda * Xtilde = Btilde
    #pragma omp parallel for schedule(static)
    for (int j = 0; j < nrColon; j++) {
        for (int i = 0; i < m; i++) {
            bt[j][i] /= (diag[j+displs[rank]] + diag[i]);
        }
    }

    // Calculate X = S^-1 * (S * Xtilde^T)
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fst_(bt[i], &n, z[omp_get_thread_num()], &nn);
    }
    MPItranspose(bt, b, nrColon, m, sendbuf, recbuf, sendcnt, sdispls, size, rank, displs);
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < nrColon; i++) {
        fstinv_(b[i], &n, z[omp_get_thread_num()], &nn);
    }

    // Calculate maximal value of solution
    double u_max = 0.0;
    #pragma omp parallel for schedule(static) reduction(max:u_max)
    for (size_t i = 0; i < nrColon; i++) {
        for (size_t j = 0; j < m; j++) {
            // compare against the known solution; the difference should be ~0
            double temp = b[i][j] - func2(grid[displs[rank]+i], grid[j]);
            if (temp > u_max) {
                u_max = temp;
            }
        }
    }
    // find the largest u_max over all processes and store it in umaxglob
    MPI_Reduce(&u_max, &umaxglob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Nodes = %d \n", size);
        printf("Threads per node = %d \n", omp_get_max_threads());
        printf("u_max = %e\n", umaxglob);           // print the maximum error
        double times = MPI_Wtime() - start;         // stop the timer
        printf("Time elapsed = %1.16f \n", times);  // print the elapsed time
    }

    MPI_Finalize();
    return 0;
}
void PetscVector<Complex>::localize_to_one (std::vector<Complex>& v_local, const processor_id_type pid) const { this->_restore_array(); PetscErrorCode ierr=0; const PetscInt n = size(); const PetscInt nl = local_size(); PetscScalar *values; v_local.resize(n); for (PetscInt i=0; i<n; i++) v_local[i] = 0.; // only one processor if (n == nl) { ierr = VecGetArray (_vec, &values); CHKERRABORT(libMesh::COMM_WORLD,ierr); for (PetscInt i=0; i<n; i++) v_local[i] = static_cast<Complex>(values[i]); ierr = VecRestoreArray (_vec, &values); CHKERRABORT(libMesh::COMM_WORLD,ierr); } // otherwise multiple processors else { numeric_index_type ioff = this->first_local_index(); /* in here the local values are stored, acting as send buffer for MPI * initialize to zero, since we collect using MPI_SUM */ std::vector<Real> real_local_values(n, 0.); std::vector<Real> imag_local_values(n, 0.); { ierr = VecGetArray (_vec, &values); CHKERRABORT(libMesh::COMM_WORLD,ierr); // provide my local share to the real and imag buffers for (PetscInt i=0; i<nl; i++) { real_local_values[i+ioff] = static_cast<Complex>(values[i]).real(); imag_local_values[i+ioff] = static_cast<Complex>(values[i]).imag(); } ierr = VecRestoreArray (_vec, &values); CHKERRABORT(libMesh::COMM_WORLD,ierr); } /* have buffers of the real and imaginary part of v_local. * Once MPI_Reduce() collected all the real and imaginary * parts in these std::vector<Real>, the values can be * copied to v_local */ std::vector<Real> real_v_local(n); std::vector<Real> imag_v_local(n); // collect entries from other proc's in real_v_local, imag_v_local MPI_Reduce (&real_local_values[0], &real_v_local[0], n, MPI_REAL, MPI_SUM, pid, libMesh::COMM_WORLD); MPI_Reduce (&imag_local_values[0], &imag_v_local[0], n, MPI_REAL, MPI_SUM, pid, libMesh::COMM_WORLD); // copy real_v_local and imag_v_local to v_local for (PetscInt i=0; i<n; i++) v_local[i] = Complex(real_v_local[i], imag_v_local[i]); } }
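/*
 * Illustrative sketch, not libMesh code: std::complex<double> stores a
 * contiguous (real, imaginary) pair, so the two per-part reductions above
 * can be collapsed into a single MPI_SUM over 2*n doubles. The buffer
 * names are placeholders.
 */
#include <mpi.h>
#include <complex>
#include <vector>

void collect_complex(int n, int root, MPI_Comm comm)
{
    std::vector<std::complex<double> > local(n, 0.), global(n, 0.);

    /* ... each rank fills its own slice of "local", leaving the rest zero ... */

    /* one reduction over the interleaved real/imaginary storage */
    MPI_Reduce(&local[0], &global[0], 2 * n, MPI_DOUBLE, MPI_SUM, root, comm);
}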
void grad(int *n, int nthreads) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); double *data; Complex *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_r2c(n,isize,istart,osize,ostart,c_comm); //data=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); data=(double*)accfft_alloc(alloc_max); data_hat=(Complex*)accfft_alloc(alloc_max); accfft_init(nthreads); /* Create FFT plan */ setup_time=-MPI_Wtime(); accfft_plan * plan=accfft_plan_dft_3d_r2c(n,data,(double*)data_hat,c_comm,ACCFFT_MEASURE); setup_time+=MPI_Wtime(); /* Initialize data */ initialize(data,n,c_comm); MPI_Barrier(c_comm); double * gradx=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double * grady=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double * gradz=(double*)accfft_alloc(isize[0]*isize[1]*isize[2]*sizeof(double)); double timings[5]={0}; std::bitset<3> XYZ=0; XYZ[0]=1; XYZ[1]=1; XYZ[2]=1; double exec_time=-MPI_Wtime(); accfft_grad(gradx,grady,gradz,data,plan,XYZ,timings); exec_time+=MPI_Wtime(); /* Check err*/ PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Gradx>>>>>>>>"<<std::endl; check_err_grad(gradx,n,c_comm,0); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Grady>>>>>>>>"<<std::endl; check_err_grad(grady,n,c_comm,1); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; PCOUT<<">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"<<std::endl; PCOUT<<">>>>>>>>Checking Gradz>>>>>>>>"<<std::endl; check_err_grad(gradz,n,c_comm,2); PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"<<std::endl; PCOUT<<"<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"<<std::endl; /* Compute some timings statistics */ double g_setup_time,g_timings[5],g_exec_time; MPI_Reduce(timings,g_timings,5, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); MPI_Reduce(&exec_time,&g_exec_time,1, MPI_DOUBLE, MPI_MAX,0, c_comm); PCOUT<<"Timing for Grad Computation for size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; PCOUT<<"Setup \t\t"<<g_setup_time<<std::endl; PCOUT<<"Evaluation \t"<<g_exec_time<<std::endl; accfft_free(data); accfft_free(data_hat); MPI_Barrier(c_comm); accfft_free(gradx); accfft_free(grady); accfft_free(gradz); accfft_destroy_plan(plan); accfft_cleanup(); MPI_Comm_free(&c_comm); PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------"<<std::endl; PCOUT<<"-------------------------------------------------------\n"<<std::endl; return ; } // end grad
int main(int argc, char *argv[]) { GRID *g; DOF *u_h; MAT *A, *A0, *B; MAP *map; INT i; size_t nnz, mem, mem_peak; VEC *x, *y0, *y1, *y2; double t0, t1, dnz, dnz1, mflops, mop; char *fn = "../test/cube.dat"; FLOAT mem_max = 300; INT refine = 0; phgOptionsRegisterFilename("-mesh_file", "Mesh file", (char **)&fn); phgOptionsRegisterInt("-loop_count", "Loop count", &loop_count); phgOptionsRegisterInt("-refine", "Refinement level", &refine); phgOptionsRegisterFloat("-mem_max", "Maximum memory", &mem_max); phgInit(&argc, &argv); g = phgNewGrid(-1); if (!phgImport(g, fn, FALSE)) phgError(1, "can't read file \"%s\".\n", fn); phgRefineAllElements(g, refine); u_h = phgDofNew(g, DOF_DEFAULT, 1, "u_h", DofNoAction); while (TRUE) { phgPrintf("\n"); if (phgBalanceGrid(g, 1.2, 1, NULL, 0.)) phgPrintf("Repartition mesh, %d submeshes, load imbalance: %lg\n", g->nprocs, (double)g->lif); map = phgMapCreate(u_h, NULL); A = phgMapCreateMat(map, map); A->handle_bdry_eqns = TRUE; build_matrix(A, u_h); phgMatAssemble(A); /* Note: A is unsymmetric (A' != A) if boundary entries not removed */ phgMatRemoveBoundaryEntries(A); #if 0 /* test block matrix operation */ A0 = phgMatCreateBlockMatrix(g->comm, 1, 1, &A, NULL); #else A0 = A; #endif phgPrintf("%d DOF, %d elems, %d submeshes, matrix size: %d, LIF: %lg\n", DofGetDataCountGlobal(u_h), g->nleaf_global, g->nprocs, A->rmap->nglobal, (double)g->lif); /* test PHG mat-vec multiply */ x = phgMapCreateVec(A->cmap, 1); y1 = phgMapCreateVec(A->rmap, 1); phgVecRandomize(x, 123); phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { phgMatVec(MAT_OP_N, 1.0, A0, x, 0.0, &y1); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); y0 = phgVecCopy(y1, NULL); nnz = A->nnz_d + A->nnz_o; #if USE_MPI dnz1 = nnz; MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm); #else dnz = nnz; #endif mop = loop_count * (dnz + dnz - A->rmap->nlocal) * 1e-6; phgPrintf("\n"); t1 -= t0; phgPrintf(" PHG: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF)\n", t1, dnz, mop / (t1 == 0 ? 1. : t1), mflops); /* test trans(A)*x */ phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { phgMatVec(MAT_OP_T, 1.0, A0, x, 0.0, &y1); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); t1 -= t0; phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), " "err: %le\n", t1, dnz, mop / (t1 == 0 ? 1. 
: t1), mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL)); /* time A * trans(A) */ phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); B = phgMatMat(MAT_OP_N, MAT_OP_N, 1.0, A, A, 0.0, NULL); t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); nnz = B->nnz_d + B->nnz_o; #if USE_MPI dnz1 = nnz; MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm); #else dnz = nnz; #endif /* compare B*x <--> A*A*x */ y2 = phgMatVec(MAT_OP_N, 1.0, B, x, 0.0, NULL); phgMatVec(MAT_OP_N, 1.0, A0, y0, 0.0, &y1); phgMatDestroy(&B); t1 -= t0; phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n", t1, dnz, mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL)); #if USE_PETSC { Mat ma, mb; MatInfo info; Vec va, vb, vc; PetscScalar *vec; ma = phgPetscCreateMatAIJ(A); MatGetVecs(ma, PETSC_NULL, &va); VecDuplicate(va, &vb); VecGetArray(va, &vec); memcpy(vec, x->data, x->map->nlocal * sizeof(*vec)); VecRestoreArray(va, &vec); MatMult(ma, va, vb); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { MatMult(ma, va, vb); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); VecGetArray(vb, &vec); memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec)); VecRestoreArray(vb, &vec); MatGetInfo(ma, MAT_GLOBAL_SUM, &info); /*phgPrintf(" --------------------------------------------" "-------------------------\n");*/ phgPrintf("\n"); t1 -= t0; dnz = info.nz_used; phgPrintf(" PETSc: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), " "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL)); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { MatMultTranspose(ma, va, vb); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); VecGetArray(vb, &vec); memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec)); VecRestoreArray(vb, &vec); t1 -= t0; phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), " "err: %le\n", t1, dnz, mop / (t1==0 ? 
1.:t1), mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL)); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); MatMatMult(ma, ma, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &mb); t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); t1 -= t0; MatGetInfo(mb, MAT_GLOBAL_SUM, &info); dnz = info.nz_used; VecDuplicate(va, &vc); /* compare B*x <--> A*A*x */ MatMult(ma, vb, vc); MatMult(mb, va, vb); VecGetArray(vb, &vec); memcpy(y1->data, vec, x->map->nlocal * sizeof(*vec)); VecRestoreArray(vb, &vec); VecGetArray(vc, &vec); memcpy(y2->data, vec, x->map->nlocal * sizeof(*vec)); VecRestoreArray(vc, &vec); phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n", t1, dnz, mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL)); phgPetscMatDestroy(&mb); phgPetscMatDestroy(&ma); phgPetscVecDestroy(&va); phgPetscVecDestroy(&vb); phgPetscVecDestroy(&vc); } #endif /* USE_PETSC */ #if USE_HYPRE { HYPRE_IJMatrix ma; HYPRE_IJVector va, vb, vc; HYPRE_ParCSRMatrix par_ma; hypre_ParCSRMatrix *par_mb; HYPRE_ParVector par_va, par_vb, par_vc; HYPRE_Int offset, *ni, start, end; assert(sizeof(INT)==sizeof(int) && sizeof(FLOAT)==sizeof(double)); setup_hypre_mat(A, &ma); ni = phgAlloc(2 * A->rmap->nlocal * sizeof(*ni)); offset = A->cmap->partition[A->cmap->rank]; for (i = 0; i < A->rmap->nlocal; i++) ni[i] = i + offset; HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &va); HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &vb); HYPRE_IJVectorCreate(g->comm, offset, offset + A->rmap->nlocal - 1, &vc); HYPRE_IJVectorSetObjectType(va, HYPRE_PARCSR); HYPRE_IJVectorSetObjectType(vb, HYPRE_PARCSR); HYPRE_IJVectorSetObjectType(vc, HYPRE_PARCSR); HYPRE_IJVectorSetMaxOffProcElmts(va, 0); HYPRE_IJVectorSetMaxOffProcElmts(vb, 0); HYPRE_IJVectorSetMaxOffProcElmts(vc, 0); HYPRE_IJVectorInitialize(va); HYPRE_IJVectorInitialize(vb); HYPRE_IJVectorInitialize(vc); HYPRE_IJMatrixGetObject(ma, (void **)(void *)&par_ma); HYPRE_IJVectorGetObject(va, (void **)(void *)&par_va); HYPRE_IJVectorGetObject(vb, (void **)(void *)&par_vb); HYPRE_IJVectorGetObject(vc, (void **)(void *)&par_vc); HYPRE_IJVectorSetValues(va, A->cmap->nlocal, ni, (double *)x->data); HYPRE_IJVectorAssemble(va); HYPRE_IJVectorAssemble(vb); HYPRE_IJVectorAssemble(vc); HYPRE_IJMatrixGetRowCounts(ma, A->cmap->nlocal, ni, ni + A->rmap->nlocal); for (i = 0, nnz = 0; i < A->rmap->nlocal; i++) nnz += ni[A->rmap->nlocal + i]; #if USE_MPI dnz1 = nnz; MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm); #else dnz = nnz; #endif HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_va, 0.0, par_vb); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data); /*phgPrintf(" --------------------------------------------" "-------------------------\n");*/ phgPrintf("\n"); t1 -= t0; phgPrintf(" HYPRE: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), " "err: %le\n", t1, dnz, mop / (t1==0 ? 
1.:t1), mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL)); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); for (i = 0; i < loop_count; i++) { HYPRE_ParCSRMatrixMatvecT(1.0, par_ma, par_va, 0.0, par_vb); } t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data); t1 -= t0; phgPrintf(" A'*x: time %0.4lf, nnz %0.16lg, %0.2lfMF (%0.2lfMF), " "err: %le\n", t1, dnz, mop / (t1==0 ? 1.:t1), mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y0, 1.0, &y1), 0, NULL)); phgPerfGetMflops(g, NULL, NULL); /* reset flops counter */ t0 = phgGetTime(NULL); /* Note: 'HYPRE_ParCSRMatrix' is currently typedef'ed to * 'hypre_ParCSRMatrix *' */ par_mb = hypre_ParMatmul((hypre_ParCSRMatrix *)par_ma, (hypre_ParCSRMatrix *)par_ma); t1 = phgGetTime(NULL); mflops = phgPerfGetMflops(g, NULL, NULL); start = hypre_ParCSRMatrixFirstRowIndex(par_mb); end = hypre_ParCSRMatrixLastRowIndex(par_mb) + 1; for (i = start, nnz = 0; i < end; i++) { HYPRE_Int ncols; hypre_ParCSRMatrixGetRow(par_mb, i, &ncols, NULL, NULL); hypre_ParCSRMatrixRestoreRow(par_mb, i, &ncols, NULL, NULL); nnz += ncols; } #if USE_MPI dnz1 = nnz; MPI_Reduce(&dnz1, &dnz, 1, MPI_DOUBLE, MPI_SUM, 0, g->comm); #else dnz = nnz; #endif /* compare B*x <--> A*A*x */ HYPRE_ParCSRMatrixMatvec(1.0, par_ma, par_vb, 0.0, par_vc); HYPRE_ParCSRMatrixMatvec(1.0, (void *)par_mb, par_va, 0.0, par_vb); HYPRE_IJVectorGetValues(vb, A->rmap->nlocal, ni, (double*)y1->data); HYPRE_IJVectorGetValues(vc, A->rmap->nlocal, ni, (double*)y2->data); hypre_ParCSRMatrixDestroy((par_mb)); t1 -= t0; phgPrintf(" A*A: time %0.4lf, nnz %0.16lg, %0.2lfMF, err: %le\n", t1, dnz, mflops, (double)phgVecNorm2(phgVecAXPBY(-1.0, y1, 1.0, &y2), 0, NULL)); phgFree(ni); HYPRE_IJMatrixDestroy(ma); HYPRE_IJVectorDestroy(va); HYPRE_IJVectorDestroy(vb); HYPRE_IJVectorDestroy(vc); } #endif /* USE_HYPRE */ if (A0 != A) phgMatDestroy(&A0); #if 0 if (A->rmap->nglobal > 1000) { VEC *v = phgMapCreateVec(A->rmap, 3); for (i = 0; i < v->map->nlocal; i++) { v->data[i + 0 * v->map->nlocal] = 1 * (i + v->map->partition[g->rank]); v->data[i + 1 * v->map->nlocal] = 2 * (i + v->map->partition[g->rank]); v->data[i + 2 * v->map->nlocal] = 3 * (i + v->map->partition[g->rank]); } phgMatDumpMATLAB(A, "A", "A.m"); phgVecDumpMATLAB(v, "v", "v.m"); phgFinalize(); exit(0); } #endif phgMatDestroy(&A); phgVecDestroy(&x); phgVecDestroy(&y0); phgVecDestroy(&y1); phgVecDestroy(&y2); phgMapDestroy(&map); mem = phgMemoryUsage(g, &mem_peak); dnz = mem / (1024.0 * 1024.0); dnz1 = mem_peak / (1024.0 * 1024.0); /*phgPrintf(" --------------------------------------------" "-------------------------\n");*/ phgPrintf("\n"); phgPrintf(" Memory: current %0.4lgMB, peak %0.4lgMB\n", dnz, dnz1); #if 0 { static int loop_count = 0; if (++loop_count == 4) break; } #endif if (mem_peak > 1024 * (size_t)1024 * mem_max) break; phgRefineAllElements(g, 1); } phgDofFree(&u_h); phgFreeGrid(&g); phgFinalize(); return 0; }
void Master::collectStats() {
    int64_t dummy = 0;
    MPI_Reduce(&dummy, &engine.conflicts, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&dummy, &engine.propagations, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&dummy, &engine.opt_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
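/*
 * Illustrative sketch of the matching worker-side calls (hypothetical names;
 * the real worker class is not part of this collection). Master::collectStats()
 * above only completes if every non-root rank in MPI_COMM_WORLD posts the same
 * three reductions, in the same order and with the same datatypes/operations,
 * contributing its local counters while the master contributes zeros.
 */
#include <mpi.h>
#include <stdint.h>

void reportStatsToMaster(int64_t conflicts, int64_t propagations, double opt_time)
{
    /* recvbuf is only significant at the root, so NULL is fine on workers */
    MPI_Reduce(&conflicts,    NULL, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&propagations, NULL, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&opt_time,     NULL, 1, MPI_DOUBLE,        MPI_MAX, 0, MPI_COMM_WORLD);
}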
int main(int argc, char ** argv) { int Block_order; size_t Block_size; size_t Colblock_size; int Tile_order=32; int tiling; int Num_procs; /* Number of ranks */ int order; /* overall matrix order */ int send_to, recv_from; /* communicating ranks */ size_t bytes; /* total amount of data to be moved */ int my_ID; /* rank */ int root=0; /* root rank of a communicator */ int iterations; /* number of times to run the pipeline algorithm */ int i, j, it, jt, ID;/* dummies */ int iter; /* index of iteration */ int phase; /* phase in the staged communication */ size_t colstart; /* sequence number of first column owned by calling rank */ int error=0; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p;/* workspace for the transpose function */ double abserr, abserr_tot; /* computed error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Status status; /* completion status of message */ MPI_Win shm_win_A; /* Shared Memory window object */ MPI_Win shm_win_B; /* Shared Memory window object */ MPI_Win shm_win_Work_in; /* Shared Memory window object */ MPI_Win shm_win_Work_out; /* Shared Memory window object */ MPI_Info rma_winfo;/* info for window */ MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank within coherence domain */ int group_size; /* number of ranks per shared memory group */ int Num_groups; /* number of shared memory group */ int group_ID; /* sequence number of shared memory group */ int size_mul; /* size multiplier; 0 for non-root ranks in coherence domain*/ int istart; MPI_Request send_req, recv_req; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); root = 0; /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4 && argc !=5){ printf("Usage: %s <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); 
MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM Matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank group size = %d\n", group_size); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that a SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Num_groups = Num_procs/group_size; Block_order = order/Num_groups; group_ID = my_ID/group_size; colstart = Block_order * group_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the column block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ /* RMA win info */ MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. 
*/ MPI_Info_set(rma_winfo, "no_locks", "true"); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); int offset = 32; MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &A_p, &shm_win_A); MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); A_p += offset; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &B_p, &shm_win_B); MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p); if (B_p == NULL){ printf(" Error allocating space for transposed matrix by group %d\n",group_ID); error = 1; } bail_out(error); B_p += offset; if (Num_groups>1) { size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, (void *) &Work_in_p, &shm_win_Work_in); MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_in_p); if (Work_in_p == NULL){ printf(" Error allocating space for in block by group %d\n",group_ID); error = 1; } bail_out(error); MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &Work_out_p, &shm_win_Work_out); MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_out_p); if (Work_out_p == NULL){ printf(" Error allocating space for out block by group %d\n",group_ID); error = 1; } bail_out(error); } /* Fill the original column matrix */ istart = 0; int chunk_size = Block_order/group_size; if (tiling) { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) { for (i=0;i<order; i+=Tile_order) for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) (order*(jt+colstart) + it); B(it,jt) = -1.0; } } } else { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = -1.0; } } /* NEED A STORE FENCE HERE */ MPI_Barrier(shm_comm); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) { for (j=0; j<Block_order; j++) B(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } } for (phase=1; phase<Num_groups; phase++){ recv_from = ((group_ID + phase )%Num_groups); send_to = ((group_ID - phase + Num_groups)%Num_groups); #ifndef SYNCHRONOUS if (shm_ID==0) { MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req); } #endif istart = send_to*Block_order; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } /* NEED A LOAD/STORE FENCE HERE */ MPI_Barrier(shm_comm); if (shm_ID==0) { #ifndef SYNCHRONOUS 
MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &status); #endif } /* NEED A LOAD FENCE HERE */ MPI_Barrier(shm_comm); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0; i<Block_order; i++) B(i,j) = Work_in(i,j); } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; /* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)(order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr_tot); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Win_free(&shm_win_A); MPI_Win_free(&shm_win_B); if (Num_groups>1) { MPI_Win_free(&shm_win_Work_in); MPI_Win_free(&shm_win_Work_out); } MPI_Info_free(&rma_winfo); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
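
/*
 * Illustrative sketch (not part of the transpose kernel above): the
 * "NEED A ... FENCE HERE" comments mark points where stores and loads to the
 * shared window must be ordered across the ranks of the SHM communicator.
 * One portable MPI-3 way to get that ordering is to keep the window inside a
 * passive-target epoch (MPI_Win_lock_all) and bracket the barrier with
 * MPI_Win_sync, as sketched below.  All names here (win, my_seg, base, ...)
 * are placeholders, not identifiers from the kernel above.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  MPI_Comm shm_comm;
  MPI_Win  win;
  MPI_Aint seg_size;
  double  *my_seg, *base;
  int      shm_id, shm_np, disp_unit;

  MPI_Init(&argc, &argv);
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_id);
  MPI_Comm_size(shm_comm, &shm_np);

  /* every rank contributes one double; segments of consecutive ranks are
     contiguous by default, so base[i] addresses rank i's element */
  MPI_Win_allocate_shared(sizeof(double), sizeof(double), MPI_INFO_NULL,
                          shm_comm, &my_seg, &win);
  MPI_Win_shared_query(win, 0, &seg_size, &disp_unit, &base);

  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

  base[shm_id] = (double)shm_id;  /* local store into the shared segment */

  MPI_Win_sync(win);              /* "store fence": publish local stores  */
  MPI_Barrier(shm_comm);          /* order the ranks                      */
  MPI_Win_sync(win);              /* "load fence": observe remote stores  */

  printf("rank %d sees neighbor value %f\n",
         shm_id, base[(shm_id + 1) % shm_np]);

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Comm_free(&shm_comm);
  MPI_Finalize();
  return 0;
}
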
int main(int argc, char *argv[]) { bool wantwf, verb; int ix, iz, is, it, wfit, im, ik, i, j, itau; int ns, nx, nz, nt, wfnt, rnx, rnz, nzx, rnzx, vnx, ntau, htau, nds; int scalet, snap, snapshot, fnx, fnz, fnzx, nk, nb; int rectx, rectz, gpz, n, m, pad1, trunc, spx, spz; float dt, t0, z0, dz, x0, dx, s0, ds, wfdt, srctrunc; float dtau, tau0, tau; char *path1, *path2, number[5], *left, *right; double tstart, tend; struct timeval tim; sf_complex c, **lt, **rt; sf_complex *ww, **dd; float ***img1, **img2, ***mig1, **mig2; float *rr, **ccr, **sill, ***fwf, ***bwf; sf_complex *cwave, *cwavem, **wave, *curr; sf_axis at, ax, az, atau; sf_file Fdat, Fsrc, Fimg1, Fimg2; sf_file Ffwf, Fbwf, Fvel; sf_file Fleft, Fright; int cpuid, numprocs, nth; float *sendbuf, *recvbuf; MPI_Comm comm=MPI_COMM_WORLD; MPI_Init(&argc, &argv); MPI_Comm_rank(comm, &cpuid); MPI_Comm_size(comm, &numprocs); sf_init(argc, argv); #ifdef _OPENMP #pragma omp parallel { nth=omp_get_num_threads(); } sf_warning(">>> Using %d threads <<<", nth); #endif gettimeofday(&tim, NULL); tstart=tim.tv_sec+(tim.tv_usec/1000000.0); if(!sf_getbool("wantwf", &wantwf)) wantwf=false; if(!sf_getbool("verb", &verb)) verb=false; if(!sf_getint("pad1", &pad1)) pad1=1; /* padding factor on the first axis */ if(!sf_getint("nb", &nb)) sf_error("Need nb= "); if(!sf_getfloat("srctrunc", &srctrunc)) srctrunc=0.4; if(!sf_getint("rectx", &rectx)) rectx=2; if(!sf_getint("rectz", &rectz)) rectz=2; if(!sf_getint("scalet", &scalet)) scalet=1; if(!sf_getint("snap", &snap)) snap=100; /* interval of the output wavefield */ if(!sf_getint("snapshot", &snapshot)) snapshot=0; /* print out the wavefield snapshots of this shot */ if(!sf_getint("nds", &nds)) sf_error("Need nds=!"); /* source and receiver positions */ if(!sf_getint("gpz", &gpz)) sf_error("Need gpz="); if(!sf_getint("spx", &spx)) sf_error("Need spx="); if(!sf_getint("spz", &spz)) sf_error("Need spz="); /* tau parameters */ if(!sf_getint("ntau", &ntau)) sf_error("Need ntau="); if(!sf_getfloat("dtau", &dtau)) sf_error("Need dtau="); if(!sf_getfloat("tau0", &tau0)) sf_error("Need tau0="); /* input/output files */ Fdat=sf_input("in"); Fimg1=sf_output("out"); Fimg2=sf_output("Fimg2"); Fsrc=sf_input("Fsrc"); Fvel=sf_input("Fpadvel"); if(wantwf){ Ffwf=sf_output("Ffwf"); Fbwf=sf_output("Fbwf"); } at=sf_iaxa(Fsrc, 1); nt=sf_n(at); dt=sf_d(at); t0=sf_o(at); ax=sf_iaxa(Fvel, 2); vnx=sf_n(ax); x0=sf_o(ax); az=sf_iaxa(Fvel, 1); rnz=sf_n(az); dz=sf_d(az); z0=sf_o(az); if(!sf_histint(Fdat, "n2", &rnx)) sf_error("Need n2= in input!"); if(!sf_histfloat(Fdat, "d2", &dx)) sf_error("Need d2= in input!"); if(!sf_histint(Fdat, "n3", &ns)) sf_error("Need n3= in input!"); if(!sf_histfloat(Fdat, "d3", &ds)) sf_error("Need d3= in input!"); if(!sf_histfloat(Fdat, "o3", &s0)) sf_error("Need o3= in input!"); wfnt=(nt-1)/scalet+1; wfdt=dt*scalet; /* double check the geometry parameters */ if(nds != (int)(ds/dx)) sf_error("Need ds/dx= %d", nds); sf_warning("s0=%g, x0+(rnx-1)*dx/2=%g", s0, x0+(rnx-1)*dx/2); //if(s0 != x0+(rnx-1)*dx/2) sf_error("Wrong origin information!"); if(vnx != nds*(ns-1)+rnx) sf_error("Wrong dimension in x axis!"); /* set up the output files */ atau=sf_iaxa(Fsrc, 1); sf_setn(atau, ntau); sf_setd(atau, dtau); sf_seto(atau, tau0); sf_setlabel(atau, "Tau"); sf_setunit(atau, "s"); sf_oaxa(Fimg1, az, 1); sf_oaxa(Fimg1, ax, 2); sf_oaxa(Fimg1, atau, 3); sf_oaxa(Fimg2, az, 1); sf_oaxa(Fimg2, ax, 2); sf_putint(Fimg2, "n3", 1); sf_settype(Fimg1, SF_FLOAT); sf_settype(Fimg2, SF_FLOAT); if(wantwf){ sf_setn(ax, rnx); sf_seto(ax, 
-(rnx-1)*dx/2.0); sf_oaxa(Ffwf, az, 1); sf_oaxa(Ffwf, ax, 2); sf_putint(Ffwf, "n3", (wfnt-1)/snap+1); sf_putfloat(Ffwf, "d3", snap*wfdt); sf_putfloat(Ffwf, "o3", t0); sf_putstring(Ffwf, "label3", "Time"); sf_putstring(Ffwf, "unit3", "s"); sf_settype(Ffwf, SF_FLOAT); sf_oaxa(Fbwf, az, 1); sf_oaxa(Fbwf, ax, 2); sf_putint(Fbwf, "n3", (wfnt-1)/snap+1); sf_putfloat(Fbwf, "d3", -snap*wfdt); sf_putfloat(Fbwf, "o3", (wfnt-1)*wfdt); sf_putstring(Fbwf, "label3", "Time"); sf_putstring(Fbwf, "unit3", "s"); sf_settype(Fbwf, SF_FLOAT); } nx=rnx+2*nb; nz=rnz+2*nb; nzx=nx*nz; rnzx=rnz*rnx; nk=cfft2_init(pad1, nz, nx, &fnz, &fnx); fnzx=fnz*fnx; /* print axies parameters for double check */ sf_warning("cpuid=%d, numprocs=%d", cpuid, numprocs); sf_warning("nt=%d, dt=%g, scalet=%d, wfnt=%d, wfdt=%g",nt, dt, scalet, wfnt, wfdt); sf_warning("vnx=%d, nx=%d, dx=%g, nb=%d, rnx=%d", vnx, nx, dx, nb, rnx); sf_warning("nz=%d, rnz=%d, dz=%g, z0=%g", nz, rnz, dz, z0); sf_warning("spx=%d, spz=%d, gpz=%d", spx, spz, gpz); sf_warning("ns=%d, ds=%g, s0=%g", ns, ds, s0); sf_warning("ntau=%d, dtau=%g, tau0=%g", ntau, dtau, tau0); sf_warning("nzx=%d, fnzx=%d, nk=%d", nzx, fnzx, nk); /* allocate storage and read data */ ww=sf_complexalloc(nt); sf_complexread(ww, nt, Fsrc); sf_fileclose(Fsrc); gpz=gpz+nb; spz=spz+nb; spx=spx+nb; trunc=srctrunc/dt+0.5; dd=sf_complexalloc2(nt, rnx); rr=sf_floatalloc(nzx); reflgen(nz, nx, spz, spx, rectz, rectx, 0, rr); fwf=sf_floatalloc3(rnz, rnx, wfnt); bwf=sf_floatalloc3(rnz, rnx, wfnt); img1=sf_floatalloc3(rnz, vnx, ntau); img2=sf_floatalloc2(rnz, vnx); mig1=sf_floatalloc3(rnz, rnx, ntau); mig2=sf_floatalloc2(rnz, rnx); ccr=sf_floatalloc2(rnz, rnx); sill=sf_floatalloc2(rnz, rnx); curr=sf_complexalloc(fnzx); cwave=sf_complexalloc(nk); cwavem=sf_complexalloc(nk); icfft2_allocate(cwavem); #ifdef _OPENMP #pragma omp parallel for private(ix, iz, itau) #endif for(ix=0; ix<vnx; ix++){ for(iz=0; iz<rnz; iz++){ img2[ix][iz]=0.; for(itau=0; itau<ntau; itau++){ img1[itau][ix][iz]=0.; } } } path1=sf_getstring("path1"); path2=sf_getstring("path2"); if(path1==NULL) path1="./mat/left"; if(path2==NULL) path2="./mat/right"; /* shot loop */ for (is=cpuid; is<ns; is+=numprocs){ /* construct the names of left and right matrices */ left=sf_charalloc(strlen(path1)); right=sf_charalloc(strlen(path2)); strcpy(left, path1); strcpy(right, path2); sprintf(number, "%d", is+1); strcat(left, number); strcat(right, number); Fleft=sf_input(left); Fright=sf_input(right); if(!sf_histint(Fleft, "n1", &n) || n != nzx) sf_error("Need n1=%d in Fleft", nzx); if(!sf_histint(Fleft, "n2", &m)) sf_error("No n2 in Fleft"); if(!sf_histint(Fright, "n1", &n) || n != m) sf_error("Need n1=%d in Fright", m); if(!sf_histint(Fright, "n2", &n) || n != nk) sf_error("Need n2=%d in Fright", nk); /* allocate storage for each shot migration */ lt=sf_complexalloc2(nzx, m); rt=sf_complexalloc2(m, nk); sf_complexread(lt[0], nzx*m, Fleft); sf_complexread(rt[0], m*nk, Fright); sf_fileclose(Fleft); sf_fileclose(Fright); /* read data */ sf_seek(Fdat, is*rnx*nt*sizeof(float complex), SEEK_SET); sf_complexread(dd[0], rnx*nt, Fdat); /* initialize curr and imaging variables */ #ifdef _OPENMP #pragma omp parallel for private(iz) #endif for(iz=0; iz<fnzx; iz++){ curr[iz]=sf_cmplx(0.,0.); } #ifdef _OPENMP #pragma omp parallel for private(ix, iz, itau) #endif for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ mig2[ix][iz]=0.; ccr[ix][iz]=0.; sill[ix][iz]=0.; for(itau=0; itau<ntau; itau++){ mig1[itau][ix][iz]=0.; } } } /* wave */ wave=sf_complexalloc2(fnzx, m); /* 
snapshot */ if(wantwf && is== snapshot) wantwf=true; else wantwf=false; wfit=0; for(it=0; it<nt; it++){ if(verb) sf_warning("Forward propagation it=%d/%d",it+1, nt); cfft2(curr, cwave); for(im=0; im<m; im++){ #ifdef _OPENMP #pragma omp parallel for private(ik) #endif for(ik=0; ik<nk; ik++){ #ifdef SF_HAS_COMPLEX_H cwavem[ik]=cwave[ik]*rt[ik][im]; #else cwavem[ik]=sf_cmul(cwave[ik],rt[ik][im]); #endif } icfft2(wave[im],cwavem); } #ifdef _OPENMP #pragma omp parallel for private(ix, iz, i, j, im, c) shared(curr, it) #endif for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ i=iz+ix*nz; j=iz+ix*fnz; if(it<trunc){ #ifdef SF_HAS_COMPLEX_H c=ww[it]*rr[i]; #else c=sf_crmul(ww[it],rr[i]); #endif }else{ c=sf_cmplx(0.,0.); } // c += curr[j]; for(im=0; im<m; im++){ #ifdef SF_HAS_COMPLEX_H c += lt[im][i]*wave[im][j]; #else c += sf_cmul(lt[im][i], wave[im][j]); #endif } curr[j]=c; } } if(it%scalet==0){ #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ fwf[wfit][ix][iz]=crealf(curr[(ix+nb)*fnz+(iz+nb)]); } } wfit++; } } //end of it /* check wfnt */ if(wfit != wfnt) sf_error("At this point, wfit should be equal to wfnt"); /* backward propagation starts from here... */ #ifdef _OPENMP #pragma omp parallel for private(iz) #endif for(iz=0; iz<fnzx; iz++){ curr[iz]=sf_cmplx(0.,0.); } wfit=wfnt-1; for(it=nt-1; it>=0; it--){ if(verb) sf_warning("Backward propagation it=%d/%d",it+1, nt); #ifdef _OPENMP #pragma omp parallel for private(ix) #endif for(ix=0; ix<rnx; ix++){ curr[(ix+nb)*fnz+gpz]+=dd[ix][it]; } cfft2(curr, cwave); for(im=0; im<m; im++){ #ifdef _OPENMP #pragma omp parallel for private(ik) #endif for(ik=0; ik<nk; ik++){ #ifdef SF_HAS_COMPLEX_H cwavem[ik]=cwave[ik]*conjf(rt[ik][im]); #else cwavem[ik]=sf_cmul(cwave[ik],conjf(rt[ik][im])); #endif } icfft2(wave[im],cwavem); } #ifdef _OPENMP #pragma omp parallel for private(ix, iz, i, j, im, c) shared(curr, it) #endif for(ix=0; ix<nx; ix++){ for(iz=0; iz<nz; iz++){ i=iz+ix*nz; j=iz+ix*fnz; // c=curr[j]; c=sf_cmplx(0.,0.); for(im=0; im<m; im++){ #ifdef SF_HAS_COMPLEX_H c += conjf(lt[im][i])*wave[im][j]; #else c += sf_cmul(conjf(lt[im][i]), wave[im][j]); #endif } curr[j]=c; } } if(it%scalet==0){ #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ bwf[wfit][ix][iz]=crealf(curr[(ix+nb)*fnz+(iz+nb)]); ccr[ix][iz] += fwf[wfit][ix][iz]*bwf[wfit][ix][iz]; sill[ix][iz] += fwf[wfit][ix][iz]*fwf[wfit][ix][iz]; } } wfit--; } } //end of it if(wfit != -1) sf_error("Check program! 
The final wfit should be -1!"); /* free storage */ free(*rt); free(rt); free(*lt); free(lt); free(*wave); free(wave); free(left); free(right); /* normalized image */ #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif for (ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ mig2[ix][iz]=ccr[ix][iz]/(sill[ix][iz]+SF_EPS); // sill[ix][iz]=0.; } } /* calculate time shift gathers */ for(itau=0; itau<ntau; itau++){ sf_warning("itau/ntau=%d/%d", itau+1, ntau); tau=itau*dtau+tau0; htau=tau/wfdt; for(it=abs(htau); it<wfnt-abs(htau); it++){ #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ mig1[itau][ix][iz]+=fwf[it+htau][ix][iz]*bwf[it-htau][ix][iz]; // sill[ix][iz]+=fwf[it+htau][ix][iz]*fwf[it+htau][ix][iz]; } // end of iz } // end of ix } // end of it /* #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif */ /* source illumination for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ mig1[itau][ix][iz] = mig1[itau][ix][iz]/(sill[ix][iz]+SF_EPS); } } */ } //end of itau /* output wavefield snapshot */ if(wantwf){ for(it=0; it<wfnt; it++){ if(it%snap==0){ sf_floatwrite(fwf[it][0], rnzx, Ffwf); sf_floatwrite(bwf[wfnt-1-it][0], rnzx, Fbwf); } } sf_fileclose(Ffwf); sf_fileclose(Fbwf); } /* add all the shot images that are on the same node */ #ifdef _OPENMP #pragma omp parallel for private(itau, ix, iz) #endif for(itau=0; itau<ntau; itau++){ for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ img1[itau][ix+is*nds][iz] += mig1[itau][ix][iz]; } } } #ifdef _OPENMP #pragma omp parallel for private(ix, iz) #endif for(ix=0; ix<rnx; ix++){ for(iz=0; iz<rnz; iz++){ img2[ix+is*nds][iz] += mig2[ix][iz]; } } } ////////////////end of ishot MPI_Barrier(comm); cfft2_finalize(); sf_fileclose(Fdat); free(ww); free(rr); free(*dd); free(dd); free(cwave); free(cwavem); free(curr); free(*ccr); free(ccr); free(*sill); free(sill); free(**fwf); free(*fwf); free(fwf); free(**bwf); free(*bwf); free(bwf); free(**mig1); free(*mig1); free(mig1); free(*mig2); free(mig2); /* sum image */ if(cpuid==0){ sendbuf=(float *)MPI_IN_PLACE; recvbuf=img1[0][0]; }else{ sendbuf=img1[0][0]; recvbuf=NULL; } MPI_Reduce(sendbuf, recvbuf, ntau*vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm); if(cpuid==0){ sendbuf=MPI_IN_PLACE; recvbuf=img2[0]; }else{ sendbuf=img2[0]; recvbuf=NULL; } MPI_Reduce(sendbuf, recvbuf, vnx*rnz, MPI_FLOAT, MPI_SUM, 0, comm); /* output image */ if(cpuid==0){ sf_floatwrite(img1[0][0], ntau*vnx*rnz, Fimg1); sf_floatwrite(img2[0], vnx*rnz, Fimg2); } MPI_Barrier(comm); sf_fileclose(Fimg1); sf_fileclose(Fimg2); free(**img1); free(*img1); free(img1); free(*img2); free(img2); gettimeofday(&tim, NULL); tend=tim.tv_sec+(tim.tv_usec/1000000.0); sf_warning(">> The computing time is %.3lf minutes <<", (tend-tstart)/60.); MPI_Finalize(); exit(0); }
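
/*
 * Illustrative sketch of the time-shift imaging condition applied in the shot
 * loop above: for each lag tau the forward (source) wavefield advanced by tau
 * is correlated with the backward (receiver) wavefield delayed by tau,
 * matching the mig1 update in the code.  The function below restates that
 * kernel on flat arrays; timeshift_image, S, R, I and lag are placeholder
 * names, not identifiers from the program above.
 */
static void timeshift_image(int nt, int nxz, int ntau, const int *lag,
                            const float *S,   /* [nt][nxz] forward wavefield  */
                            const float *R,   /* [nt][nxz] backward wavefield */
                            float *I)         /* [ntau][nxz] image, zeroed    */
{
  for (int itau = 0; itau < ntau; itau++) {
    int h  = lag[itau];               /* lag in time samples, may be negative */
    int lo = h > 0 ? h : -h;
    for (int it = lo; it < nt - lo; it++)
      for (int i = 0; i < nxz; i++)
        I[itau*nxz + i] += S[(it + h)*nxz + i] * R[(it - h)*nxz + i];
  }
}
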
double calc_forces(double* xi_opt, double* forces, int flag) { double tmpsum, sum = 0.0; int first, col, ne, size, i = flag; double* xi = NULL; apot_table_t* apt = &g_pot.apot_table; double charge[g_param.ntypes]; double sum_charges; double dp_kappa; #if defined(DIPOLE) double dp_alpha[g_param.ntypes]; double dp_b[g_calc.paircol]; double dp_c[g_calc.paircol]; #endif // DIPOLE static double rho_sum_loc, rho_sum; rho_sum_loc = rho_sum = 0.0; switch (g_pot.format_type) { case POTENTIAL_FORMAT_UNKNOWN: error(1, "Unknown potential format detected! (%s:%d)\n", __FILE__, __LINE__); case POTENTIAL_FORMAT_ANALYTIC: xi = g_pot.calc_pot.table; break; case POTENTIAL_FORMAT_TABULATED_EQ_DIST: case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST: xi = xi_opt; break; case POTENTIAL_FORMAT_KIM: error(1, "KIM format is not supported by EAM elstat force routine!"); } #if !defined(MPI) g_mpi.myconf = g_config.nconf; #endif // MPI ne = g_pot.apot_table.total_ne_par; size = apt->number; /* This is the start of an infinite loop */ while (1) { tmpsum = 0.0; /* sum of squares of local process */ rho_sum_loc = 0.0; #if defined APOT && !defined MPI if (g_pot.format_type == POTENTIAL_FORMAT_ANALYTIC) { apot_check_params(xi_opt); update_calc_table(xi_opt, xi, 0); } #endif // APOT && !MPI #if defined(MPI) /* exchange potential and flag value */ #if !defined(APOT) MPI_Bcast(xi, g_pot.calc_pot.len, MPI_DOUBLE, 0, MPI_COMM_WORLD); #endif // APOT MPI_Bcast(&flag, 1, MPI_INT, 0, MPI_COMM_WORLD); if (flag == 1) break; /* Exception: flag 1 means clean up */ #if defined(APOT) if (g_mpi.myid == 0) apot_check_params(xi_opt); MPI_Bcast(xi_opt, g_calc.ndimtot, MPI_DOUBLE, 0, MPI_COMM_WORLD); if (g_pot.format_type == POTENTIAL_FORMAT_ANALYTIC) update_calc_table(xi_opt, xi, 0); #else /* APOT */ /* if flag==2 then the potential parameters have changed -> sync */ if (flag == 2) potsync(); #endif // APOT #endif // MPI /* local arrays for electrostatic parameters */ sum_charges = 0; for (i = 0; i < g_param.ntypes - 1; i++) { if (xi_opt[2 * size + ne + i]) { charge[i] = xi_opt[2 * size + ne + i]; sum_charges += apt->ratio[i] * charge[i]; } else { charge[i] = 0.0; } } apt->last_charge = -sum_charges / apt->ratio[g_param.ntypes - 1]; charge[g_param.ntypes - 1] = apt->last_charge; if (xi_opt[2 * size + ne + g_param.ntypes - 1]) { dp_kappa = xi_opt[2 * size + ne + g_param.ntypes - 1]; } else { dp_kappa = 0.0; } #if defined(DIPOLE) for (i = 0; i < g_param.ntypes; i++) { if (xi_opt[2 * size + ne + g_param.ntypes + i]) { dp_alpha[i] = xi_opt[2 * size + ne + g_param.ntypes + i]; } else { dp_alpha[i] = 0.0; } } for (i = 0; i < g_calc.paircol; i++) { if (xi_opt[2 * size + ne + 2 * g_param.ntypes + i]) { dp_b[i] = xi_opt[2 * size + ne + 2 * g_param.ntypes + i]; } else { dp_b[i] = 0.0; } if (xi_opt[2 * size + ne + 2 * g_param.ntypes + g_calc.paircol + i]) { dp_c[i] = xi_opt[2 * size + ne + 2 * g_param.ntypes + g_calc.paircol + i]; } else { dp_c[i] = 0.0; } } #endif // DIPOLE /* init second derivatives for splines */ /* pair potentials & rho */ for (col = 0; col < g_calc.paircol + g_param.ntypes; col++) { first = g_pot.calc_pot.first[col]; switch (g_pot.format_type) { case POTENTIAL_FORMAT_UNKNOWN: error(1, "Unknown potential format detected! 
(%s:%d)\n", __FILE__, __LINE__); case POTENTIAL_FORMAT_ANALYTIC: case POTENTIAL_FORMAT_TABULATED_EQ_DIST: { spline_ed(g_pot.calc_pot.step[col], xi + first, g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2), 0.0, g_pot.calc_pot.d2tab + first); break; } case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST: { spline_ne(g_pot.calc_pot.xcoord + first, xi + first, g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2), 0.0, g_pot.calc_pot.d2tab + first); } case POTENTIAL_FORMAT_KIM: error(1, "KIM format is not supported by EAM elstat force routine!"); } } /* F */ for (col = g_calc.paircol + g_param.ntypes; col < g_calc.paircol + 2 * g_param.ntypes; col++) { first = g_pot.calc_pot.first[col]; /* gradient at left boundary matched to square root function, when 0 not in domain(F), else natural spline */ switch (g_pot.format_type) { case POTENTIAL_FORMAT_UNKNOWN: error(1, "Unknown potential format detected! (%s:%d)\n", __FILE__, __LINE__); case POTENTIAL_FORMAT_ANALYTIC: case POTENTIAL_FORMAT_TABULATED_EQ_DIST: { spline_ed(g_pot.calc_pot.step[col], xi + first, g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2), *(xi + first - 1), g_pot.calc_pot.d2tab + first); break; } case POTENTIAL_FORMAT_TABULATED_NON_EQ_DIST: { spline_ne(g_pot.calc_pot.xcoord + first, xi + first, g_pot.calc_pot.last[col] - first + 1, *(xi + first - 2), *(xi + first - 1), g_pot.calc_pot.d2tab + first); } case POTENTIAL_FORMAT_KIM: error(1, "KIM format is not supported by EAM elstat force routine!"); } } /* region containing loop over configurations */ { int self; vector tmp_force; int h, j, type1, type2, uf; #if defined(STRESS) int us = 0; int stresses = 0; #endif int n_i, n_j; double fnval, grad, fnval_tail, grad_tail, grad_i, grad_j; #if defined(DIPOLE) double p_sr_tail = 0.0; #endif atom_t* atom; neigh_t* neigh; double r; int col_F; double eam_force; double rho_val, rho_grad, rho_grad_j; /* loop over configurations: M A I N LOOP CONTAINING ALL ATOM-LOOPS */ for (h = g_mpi.firstconf; h < g_mpi.firstconf + g_mpi.myconf; h++) { uf = g_config.conf_uf[h - g_mpi.firstconf]; #if defined(STRESS) us = g_config.conf_us[h - g_mpi.firstconf]; #endif // STRESS /* reset energies and stresses */ forces[g_calc.energy_p + h] = 0.0; #if defined(STRESS) stresses = g_calc.stress_p + 6 * h; for (i = 0; i < 6; i++) forces[stresses + i] = 0.0; #endif // STRESS /* set limiting constraints */ forces[g_calc.limit_p + h] = -g_config.force_0[g_calc.limit_p + h]; #if defined(DIPOLE) /* reset dipoles and fields: LOOP Z E R O */ for (i = 0; i < g_config.inconf[h]; i++) { atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; atom->E_stat.x = 0.0; atom->E_stat.y = 0.0; atom->E_stat.z = 0.0; atom->p_sr.x = 0.0; atom->p_sr.y = 0.0; atom->p_sr.z = 0.0; } #endif // DIPOLE /* F I R S T LOOP OVER ATOMS: reset forces, dipoles */ for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ n_i = 3 * (g_config.cnfstart[h] + i); if (uf) { forces[n_i + 0] = -g_config.force_0[n_i + 0]; forces[n_i + 1] = -g_config.force_0[n_i + 1]; forces[n_i + 2] = -g_config.force_0[n_i + 2]; } else { forces[n_i + 0] = 0.0; forces[n_i + 1] = 0.0; forces[n_i + 2] = 0.0; } /* reset atomic density */ g_config.conf_atoms[g_config.cnfstart[h] - g_mpi.firstatom + i].rho = 0.0; } /* end F I R S T LOOP */ /* S E C O N D loop: calculate short-range and monopole forces, calculate static field- and dipole-contributions, calculate atomic densities */ for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = 
atom->type; n_i = 3 * (g_config.cnfstart[h] + i); for (j = 0; j < atom->num_neigh; j++) { /* neighbors */ neigh = atom->neigh + j; type2 = neigh->type; col = neigh->col[0]; /* updating tail-functions - only necessary with variing kappa */ if (!apt->sw_kappa) #if defined(DSF) elstat_dsf(neigh->r, dp_kappa, &neigh->fnval_el, &neigh->grad_el, &neigh->ggrad_el); #else elstat_shift(neigh->r, dp_kappa, &neigh->fnval_el, &neigh->grad_el, &neigh->ggrad_el); #endif // DSF /* In small cells, an atom might interact with itself */ self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0; /* calculate short-range forces */ if (neigh->r < g_pot.calc_pot.end[col]) { if (uf) { fnval = splint_comb_dir(&g_pot.calc_pot, xi, neigh->slot[0], neigh->shift[0], neigh->step[0], &grad); } else { fnval = splint_dir(&g_pot.calc_pot, xi, neigh->slot[0], neigh->shift[0], neigh->step[0]); } /* avoid double counting if atom is interacting with a copy of * itself */ if (self) { fnval *= 0.5; grad *= 0.5; } forces[g_calc.energy_p + h] += fnval; if (uf) { tmp_force.x = neigh->dist_r.x * grad; tmp_force.y = neigh->dist_r.y * grad; tmp_force.z = neigh->dist_r.z * grad; forces[n_i + 0] += tmp_force.x; forces[n_i + 1] += tmp_force.y; forces[n_i + 2] += tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] -= tmp_force.x; forces[n_j + 1] -= tmp_force.y; forces[n_j + 2] -= tmp_force.z; #if defined(STRESS) /* calculate pair stresses */ if (us) { forces[stresses + 0] -= neigh->dist.x * tmp_force.x; forces[stresses + 1] -= neigh->dist.y * tmp_force.y; forces[stresses + 2] -= neigh->dist.z * tmp_force.z; forces[stresses + 3] -= neigh->dist.x * tmp_force.y; forces[stresses + 4] -= neigh->dist.y * tmp_force.z; forces[stresses + 5] -= neigh->dist.z * tmp_force.x; } #endif // STRESS } } /* calculate monopole forces */ if (neigh->r < g_config.dp_cut && (charge[type1] || charge[type2])) { fnval_tail = neigh->fnval_el; grad_tail = neigh->grad_el; grad_i = charge[type2] * grad_tail; if (type1 == type2) { grad_j = grad_i; } else { grad_j = charge[type1] * grad_tail; } fnval = charge[type1] * charge[type2] * fnval_tail; grad = charge[type1] * grad_i; if (self) { grad_i *= 0.5; grad_j *= 0.5; fnval *= 0.5; grad *= 0.5; } forces[g_calc.energy_p + h] += fnval; if (uf) { tmp_force.x = neigh->dist.x * grad; tmp_force.y = neigh->dist.y * grad; tmp_force.z = neigh->dist.z * grad; forces[n_i + 0] += tmp_force.x; forces[n_i + 1] += tmp_force.y; forces[n_i + 2] += tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] -= tmp_force.x; forces[n_j + 1] -= tmp_force.y; forces[n_j + 2] -= tmp_force.z; #if defined(STRESS) /* calculate coulomb stresses */ if (us) { forces[stresses + 0] -= neigh->dist.x * tmp_force.x; forces[stresses + 1] -= neigh->dist.y * tmp_force.y; forces[stresses + 2] -= neigh->dist.z * tmp_force.z; forces[stresses + 3] -= neigh->dist.x * tmp_force.y; forces[stresses + 4] -= neigh->dist.y * tmp_force.z; forces[stresses + 5] -= neigh->dist.z * tmp_force.x; } #endif // STRESS } #if defined(DIPOLE) /* calculate static field-contributions */ atom->E_stat.x += neigh->dist.x * grad_i; atom->E_stat.y += neigh->dist.y * grad_i; atom->E_stat.z += neigh->dist.z * grad_i; g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.x -= neigh->dist.x * grad_j; g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.y -= neigh->dist.y * grad_j; g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_stat.z -= neigh->dist.z * grad_j; /* calculate short-range dipoles */ if (dp_alpha[type1] && dp_b[col] && dp_c[col]) { p_sr_tail = 
grad_tail * neigh->r * shortrange_value(neigh->r, dp_alpha[type1], dp_b[col], dp_c[col]); atom->p_sr.x += charge[type2] * neigh->dist_r.x * p_sr_tail; atom->p_sr.y += charge[type2] * neigh->dist_r.y * p_sr_tail; atom->p_sr.z += charge[type2] * neigh->dist_r.z * p_sr_tail; } if (dp_alpha[type2] && dp_b[col] && dp_c[col] && !self) { p_sr_tail = grad_tail * neigh->r * shortrange_value(neigh->r, dp_alpha[type2], dp_b[col], dp_c[col]); g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.x -= charge[type1] * neigh->dist_r.x * p_sr_tail; g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.y -= charge[type1] * neigh->dist_r.y * p_sr_tail; g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_sr.z -= charge[type1] * neigh->dist_r.z * p_sr_tail; } #endif // DIPOLE } /* calculate atomic densities */ if (atom->type == neigh->type) { /* then transfer(a->b)==transfer(b->a) */ if (neigh->r < g_pot.calc_pot.end[neigh->col[1]]) { rho_val = splint_dir(&g_pot.calc_pot, xi, neigh->slot[1], neigh->shift[1], neigh->step[1]); atom->rho += rho_val; /* avoid double counting if atom is interacting with a copy of itself */ if (!self) { g_config.conf_atoms[neigh->nr - g_mpi.firstatom].rho += rho_val; } } } else { /* transfer(a->b)!=transfer(b->a) */ if (neigh->r < g_pot.calc_pot.end[neigh->col[1]]) { atom->rho += splint_dir(&g_pot.calc_pot, xi, neigh->slot[1], neigh->shift[1], neigh->step[1]); } /* cannot use slot/shift to access splines */ if (neigh->r < g_pot.calc_pot.end[g_calc.paircol + atom->type]) g_config.conf_atoms[neigh->nr - g_mpi.firstatom].rho += (*g_splint)(&g_pot.calc_pot, xi, g_calc.paircol + atom->type, neigh->r); } } /* loop over neighbours */ col_F = g_calc.paircol + g_param.ntypes + atom->type; /* column of F */ if (atom->rho > g_pot.calc_pot.end[col_F]) { /* then punish target function -> bad potential */ forces[g_calc.limit_p + h] += DUMMY_WEIGHT * 10.0 * dsquare(atom->rho - g_pot.calc_pot.end[col_F]); atom->rho = g_pot.calc_pot.end[col_F]; } if (atom->rho < g_pot.calc_pot.begin[col_F]) { /* then punish target function -> bad potential */ forces[g_calc.limit_p + h] += DUMMY_WEIGHT * 10.0 * dsquare(g_pot.calc_pot.begin[col_F] - atom->rho); atom->rho = g_pot.calc_pot.begin[col_F]; } /* embedding energy, embedding gradient */ /* contribution to cohesive energy is F(n) */ #if defined(NORESCALE) if (atom->rho < g_pot.calc_pot.begin[col_F]) { /* linear extrapolation left */ rho_val = splint_comb(&calc_pot, xi, col_F, g_pot.calc_pot.begin[col_F], &atom->gradF); forces[energy_p + h] += rho_val + (atom->rho - g_pot.calc_pot.begin[col_F]) * atom->gradF; #if defined(APOT) forces[limit_p + h] += DUMMY_WEIGHT * 10.0 * dsquare(calc_pot.begin[col_F] - atom->rho); #endif // APOT } else if (atom->rho > g_pot.calc_pot.end[col_F]) { /* and right */ rho_val = splint_comb( &calc_pot, xi, col_F, g_pot.calc_pot.end[col_F] - 0.5 * g_pot.calc_pot.step[col_F], &atom->gradF); forces[energy_p + h] += rho_val + (atom->rho - g_pot.calc_pot.end[col_F]) * atom->gradF; #if defined(APOT) forces[limit_p + h] += DUMMY_WEIGHT * 10.0 * dsquare(atom->rho - g_pot.calc_pot.end[col_F]); #endif // APOT } /* and in-between */ else { forces[energy_p + h] += splint_comb(&calc_pot, xi, col_F, atom->rho, &atom->gradF); } #else forces[g_calc.energy_p + h] += (*g_splint_comb)( &g_pot.calc_pot, xi, col_F, atom->rho, &atom->gradF); #endif // NORESCALE /* sum up rho */ rho_sum_loc += atom->rho; } /* end S E C O N D loop over atoms */ #if defined(DIPOLE) /* T H I R D loop: calculate whole dipole moment for every atom */ double rp, dp_sum; int 
dp_converged = 0, dp_it = 0; double max_diff = 10; while (dp_converged == 0) { dp_sum = 0; for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; if (dp_alpha[type1]) { if (dp_it) { /* note: mixing parameter is different from that on in IMD */ atom->E_tot.x = (1 - g_config.dp_mix) * atom->E_ind.x + g_config.dp_mix * atom->E_old.x + atom->E_stat.x; atom->E_tot.y = (1 - g_config.dp_mix) * atom->E_ind.y + g_config.dp_mix * atom->E_old.y + atom->E_stat.y; atom->E_tot.z = (1 - g_config.dp_mix) * atom->E_ind.z + g_config.dp_mix * atom->E_old.z + atom->E_stat.z; } else { atom->E_tot.x = atom->E_ind.x + atom->E_stat.x; atom->E_tot.y = atom->E_ind.y + atom->E_stat.y; atom->E_tot.z = atom->E_ind.z + atom->E_stat.z; } atom->p_ind.x = dp_alpha[type1] * atom->E_tot.x + atom->p_sr.x; atom->p_ind.y = dp_alpha[type1] * atom->E_tot.y + atom->p_sr.y; atom->p_ind.z = dp_alpha[type1] * atom->E_tot.z + atom->p_sr.z; atom->E_old.x = atom->E_ind.x; atom->E_old.y = atom->E_ind.y; atom->E_old.z = atom->E_ind.z; atom->E_ind.x = 0.0; atom->E_ind.y = 0.0; atom->E_ind.z = 0.0; } } for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; for (j = 0; j < atom->num_neigh; j++) { /* neighbors */ neigh = atom->neigh + j; type2 = neigh->type; col = neigh->col[0]; /* In small cells, an atom might interact with itself */ self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0; if (neigh->r < g_config.dp_cut && dp_alpha[type1] && dp_alpha[type2]) { rp = SPROD( g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind, neigh->dist_r); atom->E_ind.x += neigh->grad_el * (3 * rp * neigh->dist_r.x - g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.x); atom->E_ind.y += neigh->grad_el * (3 * rp * neigh->dist_r.y - g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.y); atom->E_ind.z += neigh->grad_el * (3 * rp * neigh->dist_r.z - g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.z); if (!self) { rp = SPROD(atom->p_ind, neigh->dist_r); g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.x += neigh->grad_el * (3 * rp * neigh->dist_r.x - atom->p_ind.x); g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.y += neigh->grad_el * (3 * rp * neigh->dist_r.y - atom->p_ind.y); g_config.conf_atoms[neigh->nr - g_mpi.firstatom].E_ind.z += neigh->grad_el * (3 * rp * neigh->dist_r.z - atom->p_ind.z); } } } } for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; if (dp_alpha[type1]) { dp_sum += dsquare(dp_alpha[type1] * (atom->E_old.x - atom->E_ind.x)); dp_sum += dsquare(dp_alpha[type1] * (atom->E_old.y - atom->E_ind.y)); dp_sum += dsquare(dp_alpha[type1] * (atom->E_old.z - atom->E_ind.z)); } } dp_sum /= 3 * g_config.inconf[h]; dp_sum = sqrt(dp_sum); if (dp_it) { if ((dp_sum > max_diff) || (dp_it > 50)) { dp_converged = 1; for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; if (dp_alpha[type1]) { atom->p_ind.x = dp_alpha[type1] * atom->E_stat.x + atom->p_sr.x; atom->p_ind.y = dp_alpha[type1] * atom->E_stat.y + atom->p_sr.y; atom->p_ind.z = dp_alpha[type1] * atom->E_stat.z + atom->p_sr.z; atom->E_ind.x = atom->E_stat.x; atom->E_ind.y = atom->E_stat.y; atom->E_ind.z = atom->E_stat.z; } } } } if (dp_sum < g_config.dp_tol) dp_converged = 1; dp_it++; } /* end T H I R D 
loop over atoms */ /* F O U R T H loop: calculate monopole-dipole and dipole-dipole forces */ double rp_i, rp_j, pp_ij, tmp_1, tmp_2; double grad_1, grad_2, srval, srgrad, srval_tail, srgrad_tail, fnval_sum, grad_sum; for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; n_i = 3 * (g_config.cnfstart[h] + i); for (j = 0; j < atom->num_neigh; j++) { /* neighbors */ neigh = atom->neigh + j; type2 = neigh->type; col = neigh->col[0]; /* In small cells, an atom might interact with itself */ self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0; if (neigh->r < g_config.dp_cut && (dp_alpha[type1] || dp_alpha[type2])) { fnval_tail = -neigh->grad_el; grad_tail = -neigh->ggrad_el; if (dp_b[col] && dp_c[col]) { shortrange_term(neigh->r, dp_b[col], dp_c[col], &srval_tail, &srgrad_tail); srval = fnval_tail * srval_tail; srgrad = fnval_tail * srgrad_tail + grad_tail * srval_tail; } if (self) { fnval_tail *= 0.5; grad_tail *= 0.5; } /* monopole-dipole contributions */ if (charge[type1] && dp_alpha[type2]) { if (dp_b[col] && dp_c[col]) { fnval_sum = fnval_tail + srval; grad_sum = grad_tail + srgrad; } else { fnval_sum = fnval_tail; grad_sum = grad_tail; } rp_j = SPROD( g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind, neigh->dist_r); fnval = charge[type1] * rp_j * fnval_sum * neigh->r; grad_1 = charge[type1] * rp_j * grad_sum * neigh->r2; grad_2 = charge[type1] * fnval_sum; forces[g_calc.energy_p + h] -= fnval; if (uf) { tmp_force.x = neigh->dist_r.x * grad_1 + g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.x * grad_2; tmp_force.y = neigh->dist_r.y * grad_1 + g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.y * grad_2; tmp_force.z = neigh->dist_r.z * grad_1 + g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind.z * grad_2; forces[n_i + 0] -= tmp_force.x; forces[n_i + 1] -= tmp_force.y; forces[n_i + 2] -= tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] += tmp_force.x; forces[n_j + 1] += tmp_force.y; forces[n_j + 2] += tmp_force.z; #if defined(STRESS) /* calculate stresses */ if (us) { forces[stresses + 0] += neigh->dist.x * tmp_force.x; forces[stresses + 1] += neigh->dist.y * tmp_force.y; forces[stresses + 2] += neigh->dist.z * tmp_force.z; forces[stresses + 3] += neigh->dist.x * tmp_force.y; forces[stresses + 4] += neigh->dist.y * tmp_force.z; forces[stresses + 5] += neigh->dist.z * tmp_force.x; } #endif // STRESS } } /* dipole-monopole contributions */ if (dp_alpha[type2] && charge[type2]) { if (dp_b[col] && dp_c[col]) { fnval_sum = fnval_tail + srval; grad_sum = grad_tail + srgrad; } else { fnval_sum = fnval_tail; grad_sum = grad_tail; } rp_i = SPROD(atom->p_ind, neigh->dist_r); fnval = charge[type2] * rp_i * fnval_sum * neigh->r; grad_1 = charge[type2] * rp_i * grad_sum * neigh->r2; grad_2 = charge[type2] * fnval_sum; forces[g_calc.energy_p + h] += fnval; if (uf) { tmp_force.x = neigh->dist_r.x * grad_1 + atom->p_ind.x * grad_2; tmp_force.y = neigh->dist_r.y * grad_1 + atom->p_ind.y * grad_2; tmp_force.z = neigh->dist_r.z * grad_1 + atom->p_ind.z * grad_2; forces[n_i + 0] += tmp_force.x; forces[n_i + 1] += tmp_force.y; forces[n_i + 2] += tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] -= tmp_force.x; forces[n_j + 1] -= tmp_force.y; forces[n_j + 2] -= tmp_force.z; #if defined(STRESS) /* calculate stresses */ if (us) { forces[stresses + 0] -= neigh->dist.x * tmp_force.x; forces[stresses + 1] -= neigh->dist.y * tmp_force.y; forces[stresses + 2] 
-= neigh->dist.z * tmp_force.z; forces[stresses + 3] -= neigh->dist.x * tmp_force.y; forces[stresses + 4] -= neigh->dist.y * tmp_force.z; forces[stresses + 5] -= neigh->dist.z * tmp_force.x; } #endif // STRESS } } /* dipole-dipole contributions */ if (dp_alpha[type1] && dp_alpha[type2]) { pp_ij = SPROD( atom->p_ind, g_config.conf_atoms[neigh->nr - g_mpi.firstatom].p_ind); tmp_1 = 3 * rp_i * rp_j; tmp_2 = 3 * fnval_tail / neigh->r2; fnval = -(tmp_1 - pp_ij) * fnval_tail; grad_1 = (tmp_1 - pp_ij) * grad_tail; grad_2 = 2 * rp_i * rp_j; forces[g_calc.energy_p + h] += fnval; if (uf) { tmp_force.x = grad_1 * neigh->dist.x - tmp_2 * (grad_2 * neigh->dist.x - rp_i * neigh->r * g_config.conf_atoms[neigh->nr - g_mpi.firstatom] .p_ind.x - rp_j * neigh->r * atom->p_ind.x); tmp_force.y = grad_1 * neigh->dist.y - tmp_2 * (grad_2 * neigh->dist.y - rp_i * neigh->r * g_config.conf_atoms[neigh->nr - g_mpi.firstatom] .p_ind.y - rp_j * neigh->r * atom->p_ind.y); tmp_force.z = grad_1 * neigh->dist.z - tmp_2 * (grad_2 * neigh->dist.z - rp_i * neigh->r * g_config.conf_atoms[neigh->nr - g_mpi.firstatom] .p_ind.z - rp_j * neigh->r * atom->p_ind.z); forces[n_i + 0] -= tmp_force.x; forces[n_i + 1] -= tmp_force.y; forces[n_i + 2] -= tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] += tmp_force.x; forces[n_j + 1] += tmp_force.y; forces[n_j + 2] += tmp_force.z; #if defined(STRESS) /* calculate stresses */ if (us) { forces[stresses + 0] += neigh->dist.x * tmp_force.x; forces[stresses + 1] += neigh->dist.y * tmp_force.y; forces[stresses + 2] += neigh->dist.z * tmp_force.z; forces[stresses + 3] += neigh->dist.x * tmp_force.y; forces[stresses + 4] += neigh->dist.y * tmp_force.z; forces[stresses + 5] += neigh->dist.z * tmp_force.x; } #endif // STRESS } } } } /* loop over neighbours */ } /* end F O U R T H loop over atoms */ #endif // DIPOLE /* F I F T H loop: self energy contributions and sum-up force * contributions */ double qq; #if defined(DSF) double fnval_cut, gtail_cut, ggrad_cut; elstat_value(g_config.dp_cut, dp_kappa, &fnval_cut, >ail_cut, &ggrad_cut); #endif // DSF for (i = 0; i < g_config.inconf[h]; i++) { /* atoms */ atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; type1 = atom->type; n_i = 3 * (g_config.cnfstart[h] + i); /* self energy contributions */ if (charge[type1]) { qq = charge[type1] * charge[type1]; #if defined(DSF) fnval = qq * ( DP_EPS * dp_kappa / sqrt(M_PI) + (fnval_cut - gtail_cut * g_config.dp_cut * g_config.dp_cut )*0.5 ); #else fnval = DP_EPS * dp_kappa * qq / sqrt(M_PI); #endif // DSF forces[g_calc.energy_p + h] -= fnval; } #if defined(DIPOLE) double pp; if (dp_alpha[type1]) { pp = SPROD(atom->p_ind, atom->p_ind); fnval = pp / (2 * dp_alpha[type1]); forces[g_calc.energy_p + h] += fnval; } /* alternative dipole self energy including kappa-dependence */ // if (dp_alpha[type1]) { // pp = SPROD(atom->p_ind, atom->p_ind); // fnval = kkk * pp / sqrt(M_PI); // forces[energy_p + h] += fnval; //} #endif // DIPOLE /* sum-up: whole force contributions flow into tmpsum */ /* if (uf) {*/ /*#ifdef FWEIGHT*/ /* Weigh by absolute value of force */ /* forces[k] /= FORCE_EPS + atom->absforce;*/ /* forces[k + 1] /= FORCE_EPS + atom->absforce;*/ /* forces[k + 2] /= FORCE_EPS + atom->absforce;*/ /*#endif |+ FWEIGHT +|*/ /*#ifdef CONTRIB*/ /* if (atom->contrib)*/ /*#endif |+ CONTRIB +|*/ /* tmpsum +=*/ /* conf_weight[h] * (dsquare(forces[k]) + * dsquare(forces[k + 1]) + dsquare(forces[k + 2]));*/ /* printf("tmpsum = %f (forces)\n",tmpsum);*/ /* }*/ } /* end F I F T H 
loop over atoms */ /* S I X T H loop: EAM force */ if (uf) { /* only required if we calc forces */ for (i = 0; i < g_config.inconf[h]; i++) { atom = g_config.conf_atoms + i + g_config.cnfstart[h] - g_mpi.firstatom; n_i = 3 * (g_config.cnfstart[h] + i); for (j = 0; j < atom->num_neigh; j++) { /* loop over neighbors */ neigh = atom->neigh + j; /* In small cells, an atom might interact with itself */ self = (neigh->nr == i + g_config.cnfstart[h]) ? 1 : 0; col_F = g_calc.paircol + g_param.ntypes + atom->type; /* column of F */ r = neigh->r; /* are we within reach? */ if ((r < g_pot.calc_pot.end[neigh->col[1]]) || (r < g_pot.calc_pot.end[col_F - g_param.ntypes])) { rho_grad = (r < g_pot.calc_pot.end[neigh->col[1]]) ? splint_grad_dir(&g_pot.calc_pot, xi, neigh->slot[1], neigh->shift[1], neigh->step[1]) : 0.0; if (atom->type == neigh->type) /* use actio = reactio */ rho_grad_j = rho_grad; else rho_grad_j = (r < g_pot.calc_pot.end[col_F - g_param.ntypes]) ? (*g_splint_grad)(&g_pot.calc_pot, xi, col_F - g_param.ntypes, r) : 0.0; /* now we know everything - calculate forces */ eam_force = (rho_grad * atom->gradF + rho_grad_j * g_config.conf_atoms[(neigh->nr) - g_mpi.firstatom] .gradF); /* avoid double counting if atom is interacting with a copy of itself */ if (self) eam_force *= 0.5; tmp_force.x = neigh->dist_r.x * eam_force; tmp_force.y = neigh->dist_r.y * eam_force; tmp_force.z = neigh->dist_r.z * eam_force; forces[n_i + 0] += tmp_force.x; forces[n_i + 1] += tmp_force.y; forces[n_i + 2] += tmp_force.z; /* actio = reactio */ n_j = 3 * neigh->nr; forces[n_j + 0] -= tmp_force.x; forces[n_j + 1] -= tmp_force.y; forces[n_j + 2] -= tmp_force.z; #if defined(STRESS) /* and stresses */ if (us) { forces[stresses + 0] -= neigh->dist.x * tmp_force.x; forces[stresses + 1] -= neigh->dist.y * tmp_force.y; forces[stresses + 2] -= neigh->dist.z * tmp_force.z; forces[stresses + 3] -= neigh->dist.x * tmp_force.y; forces[stresses + 4] -= neigh->dist.y * tmp_force.z; forces[stresses + 5] -= neigh->dist.z * tmp_force.x; } #endif // STRESS } /* within reach */ } /* loop over neighbours */ #if defined(FWEIGHT) /* Weigh by absolute value of force */ forces[n_i + 0] /= FORCE_EPS + atom->absforce; forces[n_i + 1] /= FORCE_EPS + atom->absforce; forces[n_i + 2] /= FORCE_EPS + atom->absforce; #endif // FWEIGHT /* sum up forces */ #if defined(CONTRIB) if (atom->contrib) #endif // CONTRIB tmpsum += g_config.conf_weight[h] * (dsquare(forces[n_i + 0]) + dsquare(forces[n_i + 1]) + dsquare(forces[n_i + 2])); } } /* end S I X T H loop over atoms */ /* whole energy contributions flow into tmpsum */ forces[g_calc.energy_p + h] /= (double)g_config.inconf[h]; forces[g_calc.energy_p + h] -= g_config.force_0[g_calc.energy_p + h]; tmpsum += g_config.conf_weight[h] * g_param.eweight * dsquare(forces[g_calc.energy_p + h]); #if defined(STRESS) /* whole stress contributions flow into tmpsum */ if (uf && us) { for (i = 0; i < 6; i++) { forces[stresses + i] /= g_config.conf_vol[h - g_mpi.firstconf]; forces[stresses + i] -= g_config.force_0[stresses + i]; tmpsum += g_config.conf_weight[h] * g_param.sweight * dsquare(forces[stresses + i]); } } #endif // STRESS /* limiting constraints per configuration */ tmpsum += g_config.conf_weight[h] * dsquare(forces[g_calc.limit_p + h]); } /* end M A I N loop over configurations */ } /* parallel region */ #if defined(MPI) /* Reduce rho_sum */ MPI_Reduce(&rho_sum_loc, &rho_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); #else /* MPI */ rho_sum = rho_sum_loc; #endif // MPI /* dummy constraints (global) */ #if 
defined(APOT) /* add punishment for out of bounds (mostly for powell_lsq) */ if (g_mpi.myid == 0) { tmpsum += apot_punish(xi_opt, forces); } #endif // APOT #if !defined(NOPUNISH) if (g_mpi.myid == 0) { int g; for (g = 0; g < g_param.ntypes; g++) { #if defined(NORESCALE) /* clear field */ forces[g_calc.dummy_p + g_param.ntypes + g] = 0.0; /* Free end... */ /* NEW: Constraint on U': U'(1.0)=0.0; */ forces[g_calc.dummy_p + g] = DUMMY_WEIGHT * splint_grad(&calc_pot, xi, paircol + g_param.ntypes + g, 1.0); #else /* NOTHING */ forces[g_calc.dummy_p + g_param.ntypes + g] = 0.0; /* Free end... */ /* constraints on U`(n) */ forces[g_calc.dummy_p + g] = DUMMY_WEIGHT * (*g_splint_grad)( &g_pot.calc_pot, xi, g_calc.paircol + g_param.ntypes + g, 0.5 * (g_pot.calc_pot .begin[g_calc.paircol + g_param.ntypes + g] + g_pot.calc_pot .end[g_calc.paircol + g_param.ntypes + g])) - g_config.force_0[g_calc.dummy_p + g]; #endif // NORESCALE tmpsum += dsquare(forces[g_calc.dummy_p + g_param.ntypes + g]); tmpsum += dsquare(forces[g_calc.dummy_p + g]); } /* loop over types */ #if defined(NORESCALE) /* NEW: Constraint on n: <n>=1.0 ONE CONSTRAINT ONLY */ /* Calculate averages */ rho_sum /= (double)natoms; /* ATTN: if there are invariant potentials, things might be problematic */ forces[dummy_p + g_param.ntypes] = DUMMY_WEIGHT * (rho_sum - 1.0); tmpsum += dsquare(forces[dummy_p + g_param.ntypes]); #endif // NORESCALE } #endif // NOPUNISH #if defined(MPI) /* reduce global sum */ sum = 0.0; MPI_Reduce(&tmpsum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); /* gather forces, energies, stresses */ if (g_mpi.myid == 0) { /* root node already has data in place */ /* forces */ MPI_Gatherv(MPI_IN_PLACE, g_mpi.myatoms, g_mpi.MPI_VECTOR, forces, g_mpi.atom_len, g_mpi.atom_dist, g_mpi.MPI_VECTOR, 0, MPI_COMM_WORLD); /* energies */ MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, MPI_DOUBLE, forces + g_calc.energy_p, g_mpi.conf_len, g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD); #if defined(STRESS) /* stresses */ MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, g_mpi.MPI_STENS, forces + g_calc.stress_p, g_mpi.conf_len, g_mpi.conf_dist, g_mpi.MPI_STENS, 0, MPI_COMM_WORLD); #endif // STRESS #if !defined(NORESCALE) /* punishment constraints */ MPI_Gatherv(MPI_IN_PLACE, g_mpi.myconf, MPI_DOUBLE, forces + g_calc.limit_p, g_mpi.conf_len, g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD); #endif // !NORESCALE } else { /* forces */ MPI_Gatherv(forces + g_mpi.firstatom * 3, g_mpi.myatoms, g_mpi.MPI_VECTOR, forces, g_mpi.atom_len, g_mpi.atom_dist, g_mpi.MPI_VECTOR, 0, MPI_COMM_WORLD); /* energies */ MPI_Gatherv(forces + g_calc.energy_p + g_mpi.firstconf, g_mpi.myconf, MPI_DOUBLE, forces + g_calc.energy_p, g_mpi.conf_len, g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD); #if defined(STRESS) /* stresses */ MPI_Gatherv(forces + g_calc.stress_p + 6 * g_mpi.firstconf, g_mpi.myconf, g_mpi.MPI_STENS, forces + g_calc.stress_p, g_mpi.conf_len, g_mpi.conf_dist, g_mpi.MPI_STENS, 0, MPI_COMM_WORLD); #endif // STRESS #if !defined(NORESCALE) /* punishment constraints */ MPI_Gatherv(forces + g_calc.limit_p + g_mpi.firstconf, g_mpi.myconf, MPI_DOUBLE, forces + g_calc.limit_p, g_mpi.conf_len, g_mpi.conf_dist, MPI_DOUBLE, 0, MPI_COMM_WORLD); #endif // !NORESCALE } /* no need to pick up dummy constraints - they are already @ root */ #else sum = tmpsum; /* global sum = local sum */ #endif // MPI /* root process exits this function now */ if (g_mpi.myid == 0) { g_calc.fcalls++; /* Increase function call counter */ if (isnan(sum)) { #if defined(DEBUG) printf("\n--> Force is 
nan! <--\n\n"); #endif // DEBUG return 10e10; } else return sum; } } /* once a non-root process arrives here, all is done. */ return -1.0; }
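
/*
 * Illustrative sketch (placeholder program, not the potfit driver): the
 * while(1) loop in calc_forces above is a broadcast-driven worker pattern.
 * Rank 0 broadcasts a control flag each evaluation (flag 1 means clean up),
 * keeps the parameter vector in sync with a second broadcast, and the local
 * partial sums are reduced back to rank 0.  A minimal version of that control
 * flow looks like this.
 */
#include <mpi.h>
#include <stdio.h>

/* each rank's share of a toy cost function */
static double local_cost(const double *p, int n, int myid, int nprocs)
{
  double s = 0.0;
  for (int i = myid; i < n; i += nprocs)
    s += p[i] * p[i];
  return s;
}

int main(int argc, char **argv)
{
  int myid, nprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  enum { NPAR = 8, NEVAL = 3 };
  double params[NPAR];

  for (int eval = 0; ; eval++) {
    /* rank 0 decides whether to stop; everyone receives the same flag */
    int flag = (eval == NEVAL) ? 1 : 0;
    MPI_Bcast(&flag, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (flag == 1)
      break;                       /* flag 1 means "clean up", as above */

    if (myid == 0)
      for (int i = 0; i < NPAR; i++)
        params[i] = eval + i;      /* new trial parameters */
    MPI_Bcast(params, NPAR, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    double local = local_cost(params, NPAR, myid, nprocs);
    double total = 0.0;
    MPI_Reduce(&local, &total, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    if (myid == 0)
      printf("evaluation %d: cost = %g\n", eval, total);
  }

  MPI_Finalize();
  return 0;
}
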
void EXX_Fock_Band( dcomplex **H, EXX_t *exx, dcomplex ****exx_CDM, int *MP, double k1, double k2, double k3, int spin ) { double *local_H, *mpi_H; int H_n, nbuf; int i, j, n, ir, nr, irn, nep; int iop1, iop2, nb1, nb2, nb3, nb4, nrn; int ia1, ia2, ia3, ia4, ib1, ib2, ib3, ib4, icell1, icell2; int ib1_ep, ib2_ep, ib3_ep, ib4_ep; int nb1_ep, nb2_ep, nb3_ep, nb4_ep; const int *ep_atom1, *ep_atom2, *ep_cell, *atom_nb; const int *op_atom1, *op_atom2; int iep1[8], iep2[8], icell3[8], mul; double w; int GA_AN, Anum, GB_AN, Bnum, tnoA, tnoB; int neri; double *eri_list; int *iRm; double eri; dcomplex den; int nproc, myid, iproc; MPI_Status stat; int ncd, nshell_ep; int iRn_x, iRn_y, iRn_z, iRmp_x, iRmp_y, iRmp_z; double kRn, kRmp, co1, si1, co2, si2; MPI_Comm comm; double *k1_list, *k2_list, *k3_list; int *spin_list; comm = g_exx_mpicomm; MPI_Comm_size(comm, &nproc); MPI_Comm_rank(comm, &myid); k1_list = (double*)malloc(sizeof(double)*nproc); k2_list = (double*)malloc(sizeof(double)*nproc); k3_list = (double*)malloc(sizeof(double)*nproc); spin_list = (int*)malloc(sizeof(int)*nproc); /* all-to-all */ MPI_Allgather(&k1, 1, MPI_DOUBLE, k1_list, 1, MPI_DOUBLE, comm); MPI_Allgather(&k2, 1, MPI_DOUBLE, k2_list, 1, MPI_DOUBLE, comm); MPI_Allgather(&k3, 1, MPI_DOUBLE, k3_list, 1, MPI_DOUBLE, comm); MPI_Allgather(&spin, 1, MPI_INT, spin_list, 1, MPI_INT, comm); op_atom1 = EXX_Array_OP_Atom1(exx); op_atom2 = EXX_Array_OP_Atom2(exx); ep_atom1 = EXX_Array_EP_Atom1(exx); ep_atom2 = EXX_Array_EP_Atom2(exx); ep_cell = EXX_Array_EP_Cell(exx); atom_nb = EXX_atom_nb(exx); nshell_ep = EXX_Number_of_EP_Shells(exx); ncd = 2*nshell_ep+1; /* matrix size */ H_n = 0; for (i=1; i<=atomnum; i++){ H_n += Spe_Total_CNO[WhatSpecies[i]]; } /* allocation */ nbuf = H_n*H_n*2; local_H = (double*)malloc(sizeof(double)*nbuf); mpi_H = (double*)malloc(sizeof(double)*nbuf); nr = EXX_File_ERI_Read_NRecord(exx); /* clear buffer */ for (i=0; i<nbuf; i++) { local_H[i] = 0.0; } for (ir=0; ir<nr; ir++) { EXX_File_ERI_Read_Data_Head(exx, ir, &iop1, &iop2, &nb1, &nb2, &nb3, &nb4, &nrn); neri = nb1*nb2*nb3*nb4*nrn; eri_list = (double*)malloc(sizeof(double)*neri); iRm = (int*)malloc(sizeof(int)*nrn); /* Rm */ EXX_File_ERI_Read(exx, ir, eri_list, iRm, iop1, iop2, nrn, neri); for (iproc=0; iproc<nproc; iproc++) { /* clear buffer */ for (i=0; i<nbuf; i++) { mpi_H[i] = 0.0; } k1 = k1_list[iproc]; k2 = k2_list[iproc]; k3 = k3_list[iproc]; spin = spin_list[iproc]; for (irn=0; irn<nrn; irn++) { EXX_OP2EP_Band(exx, iop1, iop2, iRm[irn], &iep1[0], &iep2[0], &icell3[0], &mul); w = 1.0/(double)mul; for (j=0; j<8; j++) { ia1 = ep_atom1[iep1[j]]; /* i */ ia2 = ep_atom2[iep1[j]]; /* j */ ia3 = ep_atom1[iep2[j]]; /* k */ ia4 = ep_atom2[iep2[j]]; /* l */ icell1 = ep_cell[iep1[j]]; /* Rn */ icell2 = ep_cell[iep2[j]]; /* Rm' */ nb1_ep = atom_nb[ia1]; nb2_ep = atom_nb[ia2]; nb3_ep = atom_nb[ia3]; nb4_ep = atom_nb[ia4]; GA_AN = ia1+1; Anum = MP[GA_AN]; GB_AN = ia2+1; Bnum = MP[GB_AN]; /* phase */ iRn_x = icell1%ncd - nshell_ep; iRn_y = (icell1/ncd)%ncd - nshell_ep; iRn_z = (icell1/ncd/ncd)%ncd - nshell_ep; kRn = k1*(double)iRn_x + k2*(double)iRn_y + k3*(double)iRn_z; si1 = sin(2.0*PI*kRn); co1 = cos(2.0*PI*kRn); iRmp_x = icell2%ncd - nshell_ep; iRmp_y = (icell2/ncd)%ncd - nshell_ep; iRmp_z = (icell2/ncd/ncd)%ncd - nshell_ep; kRmp = k1*(double)iRmp_x + k2*(double)iRmp_y + k3*(double)iRmp_z; si2 = sin(2.0*PI*kRmp); co2 = cos(2.0*PI*kRmp); for (ib1_ep=0; ib1_ep<nb1_ep; ib1_ep++) { for (ib2_ep=0; ib2_ep<nb2_ep; ib2_ep++) { for (ib3_ep=0; ib3_ep<nb3_ep; ib3_ep++) { for 
(ib4_ep=0; ib4_ep<nb4_ep; ib4_ep++) {
              EXX_Basis_Index(ib1_ep, ib2_ep, ib3_ep, ib4_ep,
                              &ib1, &ib2, &ib3, &ib4, j);
              i = (((ib1*nb2+ib2)*nb3+ib3)*nb4+ib4)*nrn+irn;
              eri = eri_list[i];
              den.r = exx_CDM[spin][iep2[j]][ib3_ep][ib4_ep].r;
              den.i = exx_CDM[spin][iep2[j]][ib3_ep][ib4_ep].i;
              i = (Anum+ib1_ep-1)*H_n + (Bnum+ib2_ep-1);
              mpi_H[2*i+0] += -w*eri*(co1*den.r - si1*den.i); /* real */
              mpi_H[2*i+1] += -w*eri*(co1*den.i + si1*den.r); /* imag */
            }
          }
        }
      }
    } /* loop of j */
  } /* loop of irn */

  /* reduce to the iproc-th proc; MPI does not allow the send and receive
     buffers of MPI_Reduce to alias, so the root passes MPI_IN_PLACE */
  if (myid==iproc) {
    MPI_Reduce(MPI_IN_PLACE, mpi_H, nbuf, MPI_DOUBLE, MPI_SUM, iproc, comm);
    for (i=0; i<nbuf; i++) { local_H[i] = mpi_H[i]; }
  } else {
    MPI_Reduce(mpi_H, NULL, nbuf, MPI_DOUBLE, MPI_SUM, iproc, comm);
  }
} /* loop of iproc */

free(eri_list);
free(iRm);
} /* loop of ir */

#if 1
for (GA_AN=1; GA_AN<=atomnum; GA_AN++) {
  tnoA = Spe_Total_CNO[WhatSpecies[GA_AN]];
  Anum = MP[GA_AN];
  for (GB_AN=1; GB_AN<=atomnum; GB_AN++) {
    tnoB = Spe_Total_CNO[WhatSpecies[GB_AN]];
    Bnum = MP[GB_AN];
    for (ib1=0; ib1<tnoA; ib1++) {
      for (ib2=0; ib2<tnoB; ib2++) {
        i = (Anum+ib1-1)*H_n + (Bnum+ib2-1);
        H[Anum+ib1][Bnum+ib2].r += local_H[2*i+0];
        H[Anum+ib1][Bnum+ib2].i += local_H[2*i+1];
      }
    }
  }
}
#endif

/* free */
free(local_H);
free(mpi_H);
}
void step3_gpu(int *n) { int nprocs, procid; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Create Cartesian Communicator */ int c_dims[2]={0}; MPI_Comm c_comm; accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm); Complexf *data, *data_cpu; Complexf *data_hat; double f_time=0*MPI_Wtime(),i_time=0, setup_time=0; int alloc_max=0; int isize[3],osize[3],istart[3],ostart[3]; /* Get the local pencil size and the allocation size */ alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm); #ifdef INPLACE data_cpu=(Complexf*)malloc(alloc_max); cudaMalloc((void**) &data, alloc_max); #else data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float)); cudaMalloc((void**) &data_hat, alloc_max); #endif //accfft_init(nthreads); setup_time=-MPI_Wtime(); /* Create FFT plan */ #ifdef INPLACE accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE); #else accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE); #endif setup_time+=MPI_Wtime(); /* Warmup Runs */ #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif /* Initialize data */ initialize(data_cpu,n,c_comm); #ifdef INPLACE cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice); #else cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice); #endif MPI_Barrier(c_comm); /* Perform forward FFT */ f_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat); #endif f_time+=MPI_Wtime(); MPI_Barrier(c_comm); #ifndef INPLACE Complexf *data2_cpu, *data2; cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float)); data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float)); #endif /* Perform backward FFT */ i_time-=MPI_Wtime(); #ifdef INPLACE accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data); #else accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2); #endif i_time+=MPI_Wtime(); /* copy back results on CPU and check error*/ #ifdef INPLACE cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost); check_err(data_cpu,n,c_comm); #else cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost); check_err(data2_cpu,n,c_comm); #endif /* Compute some timings statistics */ double g_f_time, g_i_time, g_setup_time; MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); MPI_Reduce(&setup_time,&g_setup_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD); #ifdef INPLACE PCOUT<<"GPU Timing for Inplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #else PCOUT<<"GPU Timing for Outplace FFT of size "<<n[0]<<"*"<<n[1]<<"*"<<n[2]<<std::endl; #endif PCOUT<<"Setup \t"<<g_setup_time<<std::endl; PCOUT<<"FFT \t"<<g_f_time<<std::endl; PCOUT<<"IFFT \t"<<g_i_time<<std::endl; MPI_Barrier(c_comm); cudaDeviceSynchronize(); free(data_cpu); cudaFree(data); #ifndef INPLACE cudaFree(data_hat); free(data2_cpu); cudaFree(data2); #endif accfft_destroy_plan_gpu(plan); accfft_cleanup_gpuf(); MPI_Comm_free(&c_comm); return ; } // end step3_gpu
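
/*
 * Illustrative sketch: the cudaMalloc/cudaMemcpy calls in step3_gpu above are
 * unchecked.  A small helper of this shape makes device allocation failures
 * visible immediately instead of surfacing later as wrong FFT results.
 * CUDA_CHECK is a placeholder name, not part of accfft.
 */
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(call)                                                    \
  do {                                                                      \
    cudaError_t err__ = (call);                                             \
    if (err__ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                           \
              cudaGetErrorString(err__), __FILE__, __LINE__);               \
      exit(EXIT_FAILURE);                                                   \
    }                                                                       \
  } while (0)

/* usage (sketch): CUDA_CHECK(cudaMalloc((void**)&data, nbytes)); */
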
/* * This test looks at the handling of logical and for types that are not * integers or are not required integers (e.g., long long). MPICH allows * these as well. A strict MPI test should not include this test. */ int main( int argc, char *argv[] ) { int errs = 0; int rc; int rank, size; MPI_Comm comm; char cinbuf[3], coutbuf[3]; signed char scinbuf[3], scoutbuf[3]; unsigned char ucinbuf[3], ucoutbuf[3]; short sinbuf[3], soutbuf[3]; unsigned short usinbuf[3], usoutbuf[3]; long linbuf[3], loutbuf[3]; unsigned long ulinbuf[3], uloutbuf[3]; unsigned uinbuf[3], uoutbuf[3]; int iinbuf[3], ioutbuf[3]; MTest_Init( &argc, &argv ); comm = MPI_COMM_WORLD; /* Set errors return so that we can provide better information should a routine reject one of the operand/datatype pairs */ MPI_Errhandler_set( comm, MPI_ERRORS_RETURN ); MPI_Comm_rank( comm, &rank ); MPI_Comm_size( comm, &size ); #ifndef USE_STRICT_MPI /* char */ MTestPrintfMsg( 10, "Reduce of MPI_CHAR\n" ); cinbuf[0] = 0xff; cinbuf[1] = 0; cinbuf[2] = (rank > 0) ? 0x3c : 0xc3; coutbuf[0] = 0xf; coutbuf[1] = 1; coutbuf[2] = 1; rc = MPI_Reduce( cinbuf, coutbuf, 3, MPI_CHAR, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_CHAR", rc ); errs++; } else { if (rank == 0) { if (coutbuf[0] != ((size % 2) ? (char)0xff : (char)0) ) { errs++; fprintf( stderr, "char BXOR(1) test failed\n" ); } if (coutbuf[1]) { errs++; fprintf( stderr, "char BXOR(0) test failed\n" ); } if (coutbuf[2] != ((size % 2) ? (char)0xc3 : (char)0xff)) { errs++; fprintf( stderr, "char BXOR(>) test failed\n" ); } } } #endif /* USE_STRICT_MPI */ /* signed char */ MTestPrintfMsg( 10, "Reduce of MPI_SIGNED_CHAR\n" ); scinbuf[0] = 0xff; scinbuf[1] = 0; scinbuf[2] = (rank > 0) ? 0x3c : 0xc3; scoutbuf[0] = 0xf; scoutbuf[1] = 1; scoutbuf[2] = 1; rc = MPI_Reduce( scinbuf, scoutbuf, 3, MPI_SIGNED_CHAR, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_SIGNED_CHAR", rc ); errs++; } else { if (rank == 0) { if (scoutbuf[0] != ((size % 2) ? (signed char)0xff : (signed char)0) ) { errs++; fprintf( stderr, "signed char BXOR(1) test failed\n" ); } if (scoutbuf[1]) { errs++; fprintf( stderr, "signed char BXOR(0) test failed\n" ); } if (scoutbuf[2] != ((size % 2) ? (signed char)0xc3 : (signed char)0xff)) { errs++; fprintf( stderr, "signed char BXOR(>) test failed\n" ); } } } /* unsigned char */ MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_CHAR\n" ); ucinbuf[0] = 0xff; ucinbuf[1] = 0; ucinbuf[2] = (rank > 0) ? 0x3c : 0xc3; ucoutbuf[0] = 0; ucoutbuf[1] = 1; ucoutbuf[2] = 1; rc = MPI_Reduce( ucinbuf, ucoutbuf, 3, MPI_UNSIGNED_CHAR, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_CHAR", rc ); errs++; } else { if (rank == 0) { if (ucoutbuf[0] != ((size % 2) ? 0xff : 0)) { errs++; fprintf( stderr, "unsigned char BXOR(1) test failed\n" ); } if (ucoutbuf[1]) { errs++; fprintf( stderr, "unsigned char BXOR(0) test failed\n" ); } if (ucoutbuf[2] != ((size % 2) ? (unsigned char)0xc3 : (unsigned char)0xff)) { errs++; fprintf( stderr, "unsigned char BXOR(>) test failed\n" ); } } } /* bytes */ MTestPrintfMsg( 10, "Reduce of MPI_BYTE\n" ); cinbuf[0] = 0xff; cinbuf[1] = 0; cinbuf[2] = (rank > 0) ? 0x3c : 0xc3; coutbuf[0] = 0; coutbuf[1] = 1; coutbuf[2] = 1; rc = MPI_Reduce( cinbuf, coutbuf, 3, MPI_BYTE, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_BYTE", rc ); errs++; } else { if (rank == 0) { if (coutbuf[0] != ((size % 2) ? 
(char)0xff : 0)) { errs++; fprintf( stderr, "byte BXOR(1) test failed\n" ); } if (coutbuf[1]) { errs++; fprintf( stderr, "byte BXOR(0) test failed\n" ); } if (coutbuf[2] != ((size % 2) ? (char)0xc3 : (char)0xff)) { errs++; fprintf( stderr, "byte BXOR(>) test failed\n" ); } } } /* short */ MTestPrintfMsg( 10, "Reduce of MPI_SHORT\n" ); sinbuf[0] = 0xffff; sinbuf[1] = 0; sinbuf[2] = (rank > 0) ? 0x3c3c : 0xc3c3; soutbuf[0] = 0; soutbuf[1] = 1; soutbuf[2] = 1; rc = MPI_Reduce( sinbuf, soutbuf, 3, MPI_SHORT, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_SHORT", rc ); errs++; } else { if (rank == 0) { if (soutbuf[0] != ((size % 2) ? (short)0xffff : 0)) { errs++; fprintf( stderr, "short BXOR(1) test failed\n" ); } if (soutbuf[1]) { errs++; fprintf( stderr, "short BXOR(0) test failed\n" ); } if (soutbuf[2] != ((size % 2) ? (short)0xc3c3 : (short)0xffff)) { errs++; fprintf( stderr, "short BXOR(>) test failed\n" ); } } } /* unsigned short */ MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_SHORT\n" ); usinbuf[0] = 0xffff; usinbuf[1] = 0; usinbuf[2] = (rank > 0) ? 0x3c3c : 0xc3c3; usoutbuf[0] = 0; usoutbuf[1] = 1; usoutbuf[2] = 1; rc = MPI_Reduce( usinbuf, usoutbuf, 3, MPI_UNSIGNED_SHORT, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_SHORT", rc ); errs++; } else { if (rank == 0) { if (usoutbuf[0] != ((size % 2) ? 0xffff : 0)) { errs++; fprintf( stderr, "short BXOR(1) test failed\n" ); } if (usoutbuf[1]) { errs++; fprintf( stderr, "short BXOR(0) test failed\n" ); } if (usoutbuf[2] != ((size % 2) ? 0xc3c3 : 0xffff)) { errs++; fprintf( stderr, "short BXOR(>) test failed\n" ); } } } /* unsigned */ MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED\n" ); uinbuf[0] = 0xffffffff; uinbuf[1] = 0; uinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3; uoutbuf[0] = 0; uoutbuf[1] = 1; uoutbuf[2] = 1; rc = MPI_Reduce( uinbuf, uoutbuf, 3, MPI_UNSIGNED, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED", rc ); errs++; } else { if (rank == 0) { if (uoutbuf[0] != ((size % 2) ? 0xffffffff : 0)) { errs++; fprintf( stderr, "unsigned BXOR(1) test failed\n" ); } if (uoutbuf[1]) { errs++; fprintf( stderr, "unsigned BXOR(0) test failed\n" ); } if (uoutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) { errs++; fprintf( stderr, "unsigned BXOR(>) test failed\n" ); } } } /* int */ MTestPrintfMsg( 10, "Reduce of MPI_INT\n" ); iinbuf[0] = 0xffffffff; iinbuf[1] = 0; iinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3; ioutbuf[0] = 0; ioutbuf[1] = 1; ioutbuf[2] = 1; rc = MPI_Reduce( iinbuf, ioutbuf, 3, MPI_INT, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_INT", rc ); errs++; } else { if (rank == 0) { if (ioutbuf[0] != ((size % 2) ? 0xffffffff : 0)) { errs++; fprintf( stderr, "int BXOR(1) test failed\n" ); } if (ioutbuf[1]) { errs++; fprintf( stderr, "int BXOR(0) test failed\n" ); } if (ioutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) { errs++; fprintf( stderr, "int BXOR(>) test failed\n" ); } } } /* long */ MTestPrintfMsg( 10, "Reduce of MPI_LONG\n" ); linbuf[0] = 0xffffffff; linbuf[1] = 0; linbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3; loutbuf[0] = 0; loutbuf[1] = 1; loutbuf[2] = 1; rc = MPI_Reduce( linbuf, loutbuf, 3, MPI_LONG, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_LONG", rc ); errs++; } else { if (rank == 0) { if (loutbuf[0] != ((size % 2) ? 
0xffffffff : 0)) { errs++; fprintf( stderr, "long BXOR(1) test failed\n" ); } if (loutbuf[1]) { errs++; fprintf( stderr, "long BXOR(0) test failed\n" ); } if (loutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) { errs++; fprintf( stderr, "long BXOR(>) test failed\n" ); } } } /* unsigned long */ MTestPrintfMsg( 10, "Reduce of MPI_UNSIGNED_LONG\n" ); ulinbuf[0] = 0xffffffff; ulinbuf[1] = 0; ulinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3; uloutbuf[0] = 0; uloutbuf[1] = 1; uloutbuf[2] = 1; rc = MPI_Reduce( ulinbuf, uloutbuf, 3, MPI_UNSIGNED_LONG, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_UNSIGNED_LONG", rc ); errs++; } else { if (rank == 0) { if (uloutbuf[0] != ((size % 2) ? 0xffffffff : 0)) { errs++; fprintf( stderr, "unsigned long BXOR(1) test failed\n" ); } if (uloutbuf[1]) { errs++; fprintf( stderr, "unsigned long BXOR(0) test failed\n" ); } if (uloutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) { errs++; fprintf( stderr, "unsigned long BXOR(>) test failed\n" ); } } } #ifdef HAVE_LONG_LONG { long long llinbuf[3], lloutbuf[3]; /* long long */ llinbuf[0] = 0xffffffff; llinbuf[1] = 0; llinbuf[2] = (rank > 0) ? 0x3c3c3c3c : 0xc3c3c3c3; lloutbuf[0] = 0; lloutbuf[1] = 1; lloutbuf[2] = 1; if (MPI_LONG_LONG != MPI_DATATYPE_NULL) { MTestPrintfMsg( 10, "Reduce of MPI_LONG_LONG\n" ); rc = MPI_Reduce( llinbuf, lloutbuf, 3, MPI_LONG_LONG, MPI_BXOR, 0, comm ); if (rc) { MTestPrintErrorMsg( "MPI_BXOR and MPI_LONG_LONG", rc ); errs++; } else { if (rank == 0) { if (lloutbuf[0] != ((size % 2) ? 0xffffffff : 0)) { errs++; fprintf( stderr, "long long BXOR(1) test failed\n" ); } if (lloutbuf[1]) { errs++; fprintf( stderr, "long long BXOR(0) test failed\n" ); } if (lloutbuf[2] != ((size % 2) ? 0xc3c3c3c3 : 0xffffffff)) { errs++; fprintf( stderr, "long long BXOR(>) test failed\n" ); } } } } } #endif MPI_Errhandler_set( comm, MPI_ERRORS_ARE_FATAL ); MTest_Finalize( errs ); MPI_Finalize(); return 0; }
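/* A small, self-contained sketch of the property the BXOR tests above rely on:
 * when every rank contributes the same value, the XOR reduction yields that value
 * for an odd number of ranks and 0 for an even number. Illustrative code only,
 * not part of the MPICH test suite. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    unsigned in = 0xc3c3c3c3u, out = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Reduce(&in, &out, 1, MPI_UNSIGNED, MPI_BXOR, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        unsigned expected = (size % 2) ? in : 0u;  /* parity decides the result */
        printf("%s\n", (out == expected) ? " No Errors" : " BXOR mismatch");
    }
    MPI_Finalize();
    return 0;
}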
int main(int argc, char *argv[]) { int rank; int size; int i; int status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); unsigned long long* sb = (unsigned long long *) xbt_malloc(size * sizeof(unsigned long long)); unsigned long long* rb = (unsigned long long *) xbt_malloc(size * sizeof(unsigned long long)); for (i = 0; i < size; ++i) { sb[i] = rank*size + i; rb[i] = 0; } printf("[%d] sndbuf=[", rank); for (i = 0; i < size; i++) printf("%llu ", sb[i]); printf("]\n"); int root=0; status = MPI_Reduce(sb, rb, size, MPI_UNSIGNED_LONG_LONG, MPI_SUM, root, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if (rank == root) { printf("[%d] rcvbuf=[", rank); for (i = 0; i < size; i++) printf("%llu ", rb[i]); printf("]\n"); if (status != MPI_SUCCESS) { printf("MPI_Reduce returned %d\n", status); fflush(stdout); } } printf("[%d] second sndbuf=[", rank); for (i = 0; i < 1; i++) printf("%llu ", sb[i]); printf("]\n"); root=size-1; status = MPI_Reduce(sb, rb, 1, MPI_UNSIGNED_LONG_LONG, MPI_PROD, root, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if (rank == root) { printf("[%d] rcvbuf=[", rank); for (i = 0; i < 1; i++) printf("%llu ", rb[i]); printf("]\n"); if (status != MPI_SUCCESS) { printf("MPI_Reduce returned %d\n", status); fflush(stdout); } } free(sb); free(rb); MPI_Finalize(); return (EXIT_SUCCESS); }
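/* A short sketch, independent of the test above (which appears to use SimGrid's
 * xbt_malloc), showing the MPI_IN_PLACE variant of MPI_Reduce: the root passes
 * MPI_IN_PLACE as the send buffer and both contributes and receives through the
 * receive buffer, so no separate result array is needed on the root. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    unsigned long long *buf = malloc(size * sizeof(*buf));
    for (i = 0; i < size; i++)
        buf[i] = (unsigned long long)rank * size + i;

    if (rank == 0)  /* root reduces in place, on top of its own contribution */
        MPI_Reduce(MPI_IN_PLACE, buf, size, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    else
        MPI_Reduce(buf, NULL, size, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < size; i++) printf("%llu ", buf[i]);
        printf("\n");
    }
    free(buf);
    MPI_Finalize();
    return 0;
}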
void pion_norm(const int traj, const int id) { int i, j, z, zz, z0; double *Cpp; double res = 0.; double pionnorm; double atime, etime; float tmp; #ifdef MPI double mpi_res = 0.; #endif FILE *ofs, *ofs2; char *filename, *filename2, *sourcefilename; char buf[100]; char buf2[100]; char buf3[100]; filename=buf; filename2=buf2; sourcefilename=buf3; sprintf(filename,"pionnormcorrelator_finiteT.%.6d",traj); sprintf(filename2,"%s", "pion_norm.data"); /* generate random source point */ if(ranlxs_init == 0) { rlxs_init(1, 123456); } ranlxs(&tmp, 1); z0 = (int)(measurement_list[id].max_source_slice*tmp); #ifdef MPI MPI_Bcast(&z0, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif #ifdef MPI atime = MPI_Wtime(); #else atime = (double)clock()/(double)(CLOCKS_PER_SEC); #endif Cpp = (double*) calloc(g_nproc_z*LZ, sizeof(double)); printf("Doing finite Temperature online measurement\n"); /* stochastic source in z-slice */ source_generation_pion_zdir(g_spinor_field[0], g_spinor_field[1], z0, 0, traj); /* invert on the stochastic source */ invert_eo(g_spinor_field[2], g_spinor_field[3], g_spinor_field[0], g_spinor_field[1], 1.e-14, measurement_list[id].max_iter, CG, 1, 0, 1, 0, NULL, -1); /* now we bring it to normal format */ /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */ convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]); /* now we sums only over local space for every z */ for(z = 0; z < LZ; z++) { res = 0.; /* sum here over all points in one z-slice we have to look up g_ipt*/ j = g_ipt[0][0][0][z]; for(i = 0; i < T*LX*LY ; i++) { res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][j], g_spinor_field[DUM_MATRIX][j]); j += LZ; /* jump LZ sites in array, z ist fastest index */ } #if defined MPI MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_z_slices); res = mpi_res; #endif Cpp[z+g_proc_coords[3]*LZ] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_t*T)*2.; } #ifdef MPI /* some gymnastics needed in case of parallelisation */ if(g_mpi_z_rank == 0) { MPI_Gather(&Cpp[g_proc_coords[3]*LZ], LZ, MPI_DOUBLE, Cpp, LZ, MPI_DOUBLE, 0, g_mpi_ST_slices); } #endif /* and write everything into a file */ if(g_mpi_z_rank == 0 && g_proc_coords[3] == 0) { ofs = fopen(filename, "w"); fprintf( ofs, "1 1 0 %e %e\n", Cpp[z0], 0.); for(z = 1; z < g_nproc_z*LZ/2; z++) { zz = (z0+z)%(g_nproc_z*LZ); fprintf( ofs, "1 1 %d %e ", z, Cpp[zz]); zz = (z0+g_nproc_z*LZ-z)%(g_nproc_z*LZ); fprintf( ofs, "%e\n", Cpp[zz]); } zz = (z0+g_nproc_z*LZ/2)%(g_nproc_z*LZ); fprintf( ofs, "1 1 %d %e %e\n", z, Cpp[zz], 0.); fclose(ofs); /* sum over all Cpp to get pionnorm*/ ofs2 = fopen(filename2, "a"); pionnorm = 0.; for(z=0; z<g_nproc_z*LZ; z++){ pionnorm += Cpp[z]; } /* normalize */ pionnorm = pionnorm/(g_nproc_z*LZ); fprintf(ofs2,"%d\t %.16e\n",traj,pionnorm); fclose(ofs2); } free(Cpp); #ifdef MPI etime = MPI_Wtime(); #else etime = (double)clock()/(double)(CLOCKS_PER_SEC); #endif if(g_proc_id == 0 && g_debug_level > 0) { printf("PIONNORM : measurement done int t/s = %1.4e\n", etime - atime); } return; }
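/* A stripped-down sketch of the sub-communicator reduction used above: ranks are
 * split into slices with MPI_Comm_split, each slice sums a local value, and the
 * slice's rank 0 holds the per-slice result (which the real code then gathers onto
 * the writer rank). The slice count and the local value are placeholders, not the
 * lattice layout of the code above. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, nslices = 4;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Comm slice_comm;
    /* color = slice index, key = global rank keeps the original ordering */
    MPI_Comm_split(MPI_COMM_WORLD, rank % nslices, rank, &slice_comm);

    double local = (double)rank, slice_sum = 0.0;
    MPI_Reduce(&local, &slice_sum, 1, MPI_DOUBLE, MPI_SUM, 0, slice_comm);

    int slice_rank;
    MPI_Comm_rank(slice_comm, &slice_rank);
    if (slice_rank == 0)
        printf("slice %d sum = %g\n", rank % nslices, slice_sum);

    MPI_Comm_free(&slice_comm);
    MPI_Finalize();
    return 0;
}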
void compute_error_p3dfft(double * A,int* n) { double pi=4*atan(1.0); long long int size=n[0]; size*=n[1]; size*=n[2]; int istart[3],iend[3],isize[3]; int xx=0.,yy=0.,zz=0.; //double an,aa; Cp3dfft_get_dims(istart,iend,isize,1); int num_th= omp_get_max_threads(); double diff_th[num_th],norm_th[num_th]; for (int i=0;i<num_th;i++){ diff_th[i]=0.;norm_th[i]=0.; } int procid; MPI_Comm_rank(MPI_COMM_WORLD,&procid); //First Determine the constant difference between analytic and numerical solutions #pragma omp parallel { long int ptr; double X,Y,Z; double analytic_solution,diff=0.0; int thid=omp_get_thread_num(); // double aa=0.,an=0.; #pragma omp for for (int k=0; k<isize[2]; k++){ for (int j=0; j<isize[1]; j++){ for (int i=0; i<isize[0]; i++){ X=2*pi/n[0]*(istart[0]-1+i); Y=2*pi/n[1]*(istart[1]-1+j); Z=2*pi/n[2]*(istart[2]-1+k); ptr=i+isize[0]*j+isize[0]*isize[1]*k; analytic_solution=testcase(X,Y,Z); diff=std::abs(A[ptr]/size-analytic_solution); diff_th[thid]+=diff; norm_th[thid]+=analytic_solution; //std::cout<<"x,y,z= "<<X<<Y<<Z<<" diff= "<<diff<<" analytic= "<<analytic_solution<<" numerical= "<<A[ptr]/size<<std::endl; } } } } double DIFF=0.,NORM=0.; for (int i=0; i<num_th;i++){ DIFF+=diff_th[i]; NORM+=norm_th[i]; } double gerr=0,gnorm=0; MPI_Reduce(&DIFF,&gerr,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD); MPI_Reduce(&NORM,&gnorm,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD); PCOUT<<"The L1 error between iFF(a)-a is == "<<gerr<<std::endl; PCOUT<<"The Rel. L1 error between iFF(a)-a is == "<<gerr/gnorm<<std::endl; if(gerr/gnorm>1e-10){ PCOUT<<"\033[1;31m ERROR!!! FFT not computed correctly!\033[0m"<<std::endl; } else{ PCOUT<<"\033[1;36m FFT computed correctly!\033[0m"<<std::endl; } }
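/* A minimal sketch of how the manual per-thread diff_th[]/norm_th[] arrays above can
 * be replaced by an OpenMP reduction clause before the MPI reduction. The array 'A',
 * its local length 'nlocal', and reference() are placeholders, not the p3dfft layout. */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

static double reference(long i) { return (double)(i % 7); }  /* hypothetical analytic value */

void l1_error(const double *A, long nlocal, MPI_Comm comm)
{
    double diff = 0.0, norm = 0.0;
    /* each thread accumulates private copies; OpenMP combines them at the end */
    #pragma omp parallel for reduction(+:diff,norm)
    for (long i = 0; i < nlocal; i++) {
        double a = reference(i);
        diff += fabs(A[i] - a);
        norm += fabs(a);
    }
    double gdiff = 0.0, gnorm = 0.0;
    MPI_Reduce(&diff, &gdiff, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
    MPI_Reduce(&norm, &gnorm, 1, MPI_DOUBLE, MPI_SUM, 0, comm);
    int rank;
    MPI_Comm_rank(comm, &rank);
    if (rank == 0)
        printf("L1 error %g, relative %g\n", gdiff, gnorm > 0 ? gdiff / gnorm : 0.0);
}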
void reb_output_timing(struct reb_simulation* r, const double tmax){ const int N = r->N; #ifdef MPI int N_tot = 0; MPI_Reduce(&N, &N_tot, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (r->mpi_id!=0) return; #else int N_tot = N; #endif struct timeval tim; gettimeofday(&tim, NULL); double temp = tim.tv_sec+(tim.tv_usec/1000000.0); if (r->output_timing_last==-1){ r->output_timing_last = temp; }else{ printf("\r"); #ifdef PROFILING fputs("\033[A\033[2K",stdout); for (int i=0;i<=PROFILING_CAT_NUM;i++){ fputs("\033[A\033[2K",stdout); } #endif // PROFILING } printf("N_tot= %- 9d ",N_tot); if (r->integrator==REB_INTEGRATOR_SEI){ printf("t= %- 9f [orb] ",r->t*r->ri_sei.OMEGA/2./M_PI); }else{ printf("t= %- 9f ",r->t); } printf("dt= %- 9f ",r->dt); if (r->integrator==REB_INTEGRATOR_HYBRID){ printf("INT= %- 1d ",r->ri_hybrid.mode); } printf("cpu= %- 9f [s] ",temp-r->output_timing_last); if (tmax>0){ printf("t/tmax= %5.2f%%",r->t/tmax*100.0); } #ifdef PROFILING if (profiling_timing_initial==0){ struct timeval tim; gettimeofday(&tim, NULL); profiling_timing_initial = tim.tv_sec+(tim.tv_usec/1000000.0); } printf("\nCATEGORY TIME \n"); double _sum = 0; for (int i=0;i<=PROFILING_CAT_NUM;i++){ switch (i){ case PROFILING_CAT_INTEGRATOR: printf("Integrator "); break; case PROFILING_CAT_BOUNDARY: printf("Boundary check "); break; case PROFILING_CAT_GRAVITY: printf("Gravity/Forces "); break; case PROFILING_CAT_COLLISION: printf("Collisions "); break; #ifdef OPENGL case PROFILING_CAT_VISUALIZATION: printf("Visualization "); break; #endif // OPENGL case PROFILING_CAT_NUM: printf("Other "); break; } if (i==PROFILING_CAT_NUM){ printf("%5.2f%%",(1.-_sum/(profiling_time_final - profiling_timing_initial))*100.); }else{ printf("%5.2f%%\n",profiling_time_sum[i]/(profiling_time_final - profiling_timing_initial)*100.); _sum += profiling_time_sum[i]; } } #endif // PROFILING fflush(stdout); r->output_timing_last = temp; }
int main (int argc, char * argv[]) { int rank,size; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&size); MPI_Comm_rank(MPI_COMM_WORLD,&rank); int X,Y,x,y,X_ext,i; double **A, **localA; X=atoi(argv[1]); Y=X; //Extend dimension X with ghost cells if X%size!=0 if (X%size!=0) X_ext=X+size-X%size; else X_ext=X; if (rank==0) { //Allocate and init matrix A A=malloc2D(X_ext,Y); init2D(A,X,Y); } //Local dimensions x,y x=X_ext/size; y=Y; //Allocate local matrix and scatter global matrix localA=malloc2D(x,y); double * idx; for (i=0;i<x;i++) { if (rank==0) idx=&A[i*size][0]; MPI_Scatter(idx,Y,MPI_DOUBLE,&localA[i][0],y,MPI_DOUBLE,0,MPI_COMM_WORLD); } if (rank==0) free2D(A,X_ext,Y); //Timers struct timeval ts,tf,comps,compf,comms,commf; double total_time,computation_time,communication_time; MPI_Barrier(MPI_COMM_WORLD); gettimeofday(&ts,NULL); /****************************************************************************** The matrix A is distributed in a round-robin fashion to the local matrices localA You have to use point-to-point communication routines. Don't forget the timers for computation and communication! ******************************************************************************/ int line_index, line_owner; int k, start; double *k_row, *temp; MPI_Status status; temp = malloc(y * sizeof(*temp)); // k_row = malloc(y * sizeof(*k_row)); /* omoia me to allo cyclic, vriskoume ton line_owner */ for (k=0; k<y-1; k++){ line_owner = k % size; line_index = k / size; if (rank <= line_owner) start = k / size + 1; else start = k / size; if (rank == line_owner) k_row = localA[line_index]; else k_row = temp; /* set communication timer */ gettimeofday(&comms, NULL); /* COMM */ // if (rank != line_owner){ // if (rank == 0) // MPI_Recv( k_row, y, MPI_DOUBLE, size-1, MPI_ANY_SOURCE, MPI_COMM_WORLD, &status); // else // MPI_Recv( k_row, y, MPI_DOUBLE, rank-1, MPI_ANY_SOURCE, MPI_COMM_WORLD, &status); // } // // /* autos pou einai prin ton line_owner den prepei na steilei */ // if (rank != line_owner -1){ // /* o teleutaios prepei na steilei ston prwto, ektos an o prwtos einai o line_owner */ // if (rank == size-1) { // if (line_owner != 0) // MPI_Send( k_row, y, MPI_DOUBLE, 0, rank, MPI_COMM_WORLD); // } // else // MPI_Send(k_row, y, MPI_DOUBLE, rank+1, rank, MPI_COMM_WORLD); // } /* o line_owner stelnei se olous (ektos tou eautou tou) kai oloi oi alloi kanoun * receive */ if (rank == line_owner){ for (i=0; i<size; i++) if (i != line_owner) MPI_Send( k_row, y, MPI_DOUBLE, i, line_owner, MPI_COMM_WORLD); } else MPI_Recv(k_row, y, MPI_DOUBLE, line_owner, line_owner, MPI_COMM_WORLD, &status); /* stop communication timer */ gettimeofday(&commf, NULL); communication_time += commf.tv_sec - comms.tv_sec + (commf.tv_usec - comms.tv_usec)*0.000001; /* set computation timer */ gettimeofday(&comps, NULL); /* Compute */ go_to_work( localA, k_row, x, y, rank, start, k ); /* stop computation timer */ gettimeofday(&compf, NULL); computation_time += compf.tv_sec - comps.tv_sec + (compf.tv_usec - comps.tv_usec)*0.000001; } gettimeofday(&tf,NULL); total_time=tf.tv_sec-ts.tv_sec+(tf.tv_usec-ts.tv_usec)*0.000001; //Gather local matrices back to the global matrix if (rank==0) A=malloc2D(X_ext,Y); for (i=0;i<x;i++) { if (rank==0) idx=&A[i*size][0]; MPI_Gather(&localA[i][0],y,MPI_DOUBLE,idx,Y,MPI_DOUBLE,0,MPI_COMM_WORLD); } double avg_total,avg_comp,avg_comm,max_total,max_comp,max_comm; MPI_Reduce(&total_time,&max_total,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); 
MPI_Reduce(&computation_time,&max_comp,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&communication_time,&max_comm,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); MPI_Reduce(&total_time,&avg_total,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); MPI_Reduce(&computation_time,&avg_comp,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); MPI_Reduce(&communication_time,&avg_comm,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); avg_total/=size; avg_comp/=size; avg_comm/=size; if (rank==0) { printf("LU-Cyclic-p2p\tSize\t%d\tProcesses\t%d\n",X,size); printf("Max times:\tTotal\t%lf\tComp\t%lf\tComm\t%lf\n",max_total,max_comp,max_comm); printf("Avg times:\tTotal\t%lf\tComp\t%lf\tComm\t%lf\n",avg_total,avg_comp,avg_comm); } //Print triangular matrix U to file if (rank==0) { char * filename="output_cyclic_p2p"; print2DFile(A,X,Y,filename); } MPI_Finalize(); return 0; }
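/* A compact sketch of an alternative to the hand-rolled send/recv chain commented out
 * above: the owner of pivot row k broadcasts it to everyone with a single MPI_Bcast.
 * Note that elapsed-time accumulators must start at 0 before the first '+=', which
 * this sketch does explicitly. All names here are illustrative. */
#include <mpi.h>
#include <string.h>

double broadcast_pivot_rows(double **localA, double *k_row, int y,
                            int rank, int size, MPI_Comm comm)
{
    double comm_time = 0.0;            /* accumulator initialised before use */
    for (int k = 0; k < y - 1; k++) {
        int line_owner = k % size;     /* round-robin (cyclic) row distribution */
        int line_index = k / size;
        if (rank == line_owner)
            memcpy(k_row, localA[line_index], y * sizeof(double));

        double t = -MPI_Wtime();
        /* one collective replaces the owner's loop of MPI_Send calls */
        MPI_Bcast(k_row, y, MPI_DOUBLE, line_owner, comm);
        t += MPI_Wtime();
        comm_time += t;

        /* ... eliminate the rows below k using k_row, as go_to_work() does ... */
    }
    return comm_time;
}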
int main( int argc, char *argv[]) { int n, myid, numprocs, i, j; double PI25DT = 3.141592653589793238462643; double mypi, pi, h, sum, x; double startwtime = 0.0, endwtime; int namelen; int event1a, event1b, event2a, event2b, event3a, event3b, event4a, event4b; char processor_name[MPI_MAX_PROCESSOR_NAME]; MPI_Init(&argc,&argv); MPI_Pcontrol( 0 ); MPI_Comm_size(MPI_COMM_WORLD,&numprocs); MPI_Comm_rank(MPI_COMM_WORLD,&myid); MPI_Get_processor_name(processor_name,&namelen); fprintf(stderr,"Process %d running on %s\n", myid, processor_name); /* MPE_Init_log() & MPE_Finish_log() are NOT needed when liblmpe.a is linked with this program. In that case, MPI_Init() would have called MPE_Init_log() already. */ /* MPE_Init_log(); */ /* Get event ID from MPE, user should NOT assign event ID */ event1a = MPE_Log_get_event_number(); event1b = MPE_Log_get_event_number(); event2a = MPE_Log_get_event_number(); event2b = MPE_Log_get_event_number(); event3a = MPE_Log_get_event_number(); event3b = MPE_Log_get_event_number(); event4a = MPE_Log_get_event_number(); event4b = MPE_Log_get_event_number(); if (myid == 0) { MPE_Describe_state(event1a, event1b, "Broadcast", "red"); MPE_Describe_state(event2a, event2b, "Compute", "blue"); MPE_Describe_state(event3a, event3b, "Reduce", "green"); MPE_Describe_state(event4a, event4b, "Sync", "orange"); } if (myid == 0) { n = 1000000; startwtime = MPI_Wtime(); } MPI_Barrier(MPI_COMM_WORLD); MPI_Pcontrol( 1 ); /* MPE_Start_log(); */ for (j = 0; j < 5; j++) { MPE_Log_event(event1a, 0, NULL); MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); MPE_Log_event(event1b, 0, NULL); MPE_Log_event(event4a, 0, NULL); MPI_Barrier(MPI_COMM_WORLD); MPE_Log_event(event4b, 0, NULL); MPE_Log_event(event2a, 0, NULL); h = 1.0 / (double) n; sum = 0.0; for (i = myid + 1; i <= n; i += numprocs) { x = h * ((double)i - 0.5); sum += f(x); } mypi = h * sum; MPE_Log_event(event2b, 0, NULL); MPE_Log_event(event3a, 0, NULL); MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPE_Log_event(event3b, 0, NULL); } /* MPE_Finish_log("cpilog"); */ if (myid == 0) { endwtime = MPI_Wtime(); printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT)); printf("wall clock time = %f\n", endwtime-startwtime); } MPI_Finalize(); return(0); }
int main(int argc, char ** argv) { int Num_procs; // number of ranks int Num_procsx, Num_procsy; // number of ranks in each coord direction int args_used = 1; // keeps track of # consumed arguments int my_ID; // MPI rank int my_IDx, my_IDy; // coordinates of rank in rank grid int root = 0; // master rank uint64_t L; // dimension of grid in cells uint64_t iterations ; // total number of simulation steps uint64_t n; // total number of particles requested in the simulation uint64_t actual_particles, // actual number of particles owned by my rank total_particles; // total number of generated particles char *init_mode; // particle initialization mode (char) double rho ; // attenuation factor for geometric particle distribution uint64_t k, m; // determine initial horizontal and vertical velocity of // particles-- (2*k)+1 cells per time step double *grid; // the grid is represented as an array of charges uint64_t iter, i; // dummies double fx, fy, ax, ay; // particle forces and accelerations int error=0; // used for graceful exit after error uint64_t correctness=0; // boolean indicating correct particle displacements uint64_t istart, jstart, iend, jend, particles_size, particles_count; bbox_t grid_patch, // whole grid init_patch, // subset of grid used for localized initialization my_tile; // subset of grid owner by my rank particle_t *particles, *p; // array of particles owned by my rank uint64_t *cur_counts; // uint64_t ptr_my; // uint64_t owner; // owner (rank) of a particular particle double pic_time, local_pic_time, avg_time; uint64_t my_checksum = 0, tot_checksum = 0, correctness_checksum = 0; uint64_t width, height; // minimum dimensions of grid tile owned by my rank int particle_mode; // type of initialization double alpha, beta; // negative slope and offset for linear initialization int nbr[8]; // topological neighbor ranks int icrit, jcrit; // global grid indices where tile size drops to minimum find_owner_type find_owner; int ileftover, jleftover;// excess grid points divided among "fat" tiles uint64_t to_send[8], to_recv[8];// int procsize; // number of ranks per OS process MPI_Status status[16]; MPI_Request requests[16]; /* Initialize the MPI environment */ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /* FIXME: This can be further improved */ /* Create MPI data type for particle_t */ MPI_Datatype PARTICLE; MPI_Type_contiguous(sizeof(particle_t)/sizeof(double), MPI_DOUBLE, &PARTICLE); MPI_Type_commit( &PARTICLE ); if (my_ID==root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("FG_MPI Particle-in-Cell execution on 2D grid\n"); if (argc<6) { printf("Usage: %s <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]); printf("<m (vertical particle velocity)>\n"); printf(" <init mode> <init parameters>]\n"); printf(" init mode \"GEOMETRIC\" parameters: <attenuation factor>\n"); printf(" \"SINUSOIDAL\" parameters: none\n"); printf(" \"LINEAR\" parameters: <negative slope> <constant offset>\n"); printf(" \"PATCH\" parameters: <xleft> <xright> <ybottom> <ytop>\n"); error = 1; goto ENDOFTESTS; } iterations = atol(*++argv); args_used++; if (iterations<1) { printf("ERROR: Number of time steps must be positive: %llu\n", iterations); error = 1; goto ENDOFTESTS; } L = atol(*++argv); args_used++; if (L<1 || L%2) { printf("ERROR: Number of grid cells must be positive and even: %llu\n", L); error = 1; goto ENDOFTESTS; } n = atol(*++argv); args_used++; if (n<1) { printf("ERROR: 
Number of particles must be positive: %llu\n", n); error = 1; goto ENDOFTESTS; } particle_mode = UNDEFINED; k = atoi(*++argv); args_used++; if (k<0) { printf("ERROR: Particle semi-charge must be non-negative: %llu\n", k); error = 1; goto ENDOFTESTS; } m = atoi(*++argv); args_used++; init_mode = *++argv; args_used++; ENDOFTESTS:; } // done with standard initialization parameters bail_out(error); MPI_Bcast(&iterations, 1, MPI_UINT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&L, 1, MPI_UINT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&n, 1, MPI_UINT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&k, 1, MPI_UINT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&m, 1, MPI_UINT64_T, root, MPI_COMM_WORLD); grid_patch = (bbox_t){0, L+1, 0, L+1}; if (my_ID==root) { // process initialization parameters /* Initialize particles with geometric distribution */ if (strcmp(init_mode, "GEOMETRIC") == 0) { if (argc<args_used+1) { printf("ERROR: Not enough arguments for GEOMETRIC\n"); error = 1; goto ENDOFTESTS2; } particle_mode = GEOMETRIC; rho = atof(*++argv); args_used++; } /* Initialize with a sinusoidal particle distribution (single period) */ if (strcmp(init_mode, "SINUSOIDAL") == 0) { particle_mode = SINUSOIDAL; } /* Initialize particles with linear distribution */ /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/ if (strcmp(init_mode, "LINEAR") == 0) { if (argc<args_used+2) { printf("ERROR: Not enough arguments for LINEAR initialization\n"); error = 1; goto ENDOFTESTS2; exit(EXIT_FAILURE); } particle_mode = LINEAR; alpha = atof(*++argv); args_used++; beta = atof(*++argv); args_used++; if (beta <0 || beta<alpha) { printf("ERROR: linear profile gives negative particle density\n"); error = 1; goto ENDOFTESTS2; } } /* Initialize particles uniformly within a "patch" */ if (strcmp(init_mode, "PATCH") == 0) { if (argc<args_used+4) { printf("ERROR: Not enough arguments for PATCH initialization\n"); error = 1; goto ENDOFTESTS2; } particle_mode = PATCH; init_patch.left = atoi(*++argv); args_used++; init_patch.right = atoi(*++argv); args_used++; init_patch.bottom = atoi(*++argv); args_used++; init_patch.top = atoi(*++argv); args_used++; if (bad_patch(&init_patch, &grid_patch)) { printf("ERROR: inconsistent initial patch\n"); error = 1; goto ENDOFTESTS2; } } ENDOFTESTS2:; } //done with processing initializaton parameters, now broadcast bail_out(error); MPI_Bcast(&particle_mode, 1, MPI_INT, root, MPI_COMM_WORLD); switch (particle_mode) { case GEOMETRIC: MPI_Bcast(&rho, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); break; case SINUSOIDAL: break; case LINEAR: MPI_Bcast(&alpha, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&beta, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); break; case PATCH: MPI_Bcast(&init_patch.left, 1, MPI_INT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&init_patch.right, 1, MPI_INT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&init_patch.bottom, 1, MPI_INT64_T, root, MPI_COMM_WORLD); MPI_Bcast(&init_patch.top, 1, MPI_INT64_T, root, MPI_COMM_WORLD); break; } /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; if (my_ID == root) { MPIX_Get_collocated_size(&procsize); printf("Number of ranks = %llu\n", Num_procs); printf("Number of ranks/process = %d\n", procsize); printf("Load balancing = None\n"); printf("Grid size = %llu\n", L); printf("Tiles in 
x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Number of particles requested = %llu\n", n); printf("Number of time steps = %llu\n", iterations); printf("Initialization mode = %s\n", init_mode); switch(particle_mode) { case GEOMETRIC: printf(" Attenuation factor = %lf\n", rho); break; case SINUSOIDAL: break; case LINEAR: printf(" Negative slope = %lf\n", alpha); printf(" Offset = %lf\n", beta); break; case PATCH: printf(" Bounding box = %llu, %llu, %llu, %llu\n", init_patch.left, init_patch.right, init_patch.bottom, init_patch.top); break; default: printf("ERROR: Unsupported particle initializating mode\n"); error = 1; } printf("Particle charge semi-increment (k) = %llu\n", k); printf("Vertical velocity (m) = %llu\n", m); } bail_out(error); /* The processes collectively create the underlying grid following a 2D block decomposition; unlike in the stencil code, successive blocks share an overlap vertex */ width = L/Num_procsx; if (width < 2*k) { if (my_ID==0) printf("k-value too large: %llu, must be no greater than %llu\n", k, width/2); bail_out(1); } ileftover = L%Num_procsx; if (my_IDx<ileftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * ileftover + width * (my_IDx-ileftover); iend = istart + width; } icrit = (width+1) * ileftover; height = L/Num_procsy; if (height < m) { if (my_ID==0) printf("m-value too large: %llu, must be no greater than %llu\n", m, height); bail_out(1); } jleftover = L%Num_procsy; if (my_IDy<jleftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * jleftover + height * (my_IDy-jleftover); jend = jstart + height; } jcrit = (height+1) * jleftover; /* if the problem can be divided evenly among ranks, use the simple owner finding function */ if (icrit==0 && jcrit==0) { find_owner = find_owner_simple; if (my_ID==root) printf("Rank search mode used = simple\n"); } else { find_owner = find_owner_general; if (my_ID==root) printf("Rank search mode used = general\n"); } /* define bounding box for tile owned by my rank for convenience */ my_tile = (bbox_t){istart,iend,jstart,jend}; /* Find neighbors. Indexing: left=0, right=1, bottom=2, top=3, bottom-left=4, bottom-right=5, top-left=6, top-right=7 */ /* These are IDs in the global communicator */ nbr[0] = (my_IDx == 0 ) ? my_ID + Num_procsx - 1 : my_ID - 1; nbr[1] = (my_IDx == Num_procsx-1) ? my_ID - Num_procsx + 1 : my_ID + 1; nbr[2] = (my_IDy == Num_procsy-1) ? my_ID + Num_procsx - Num_procs : my_ID + Num_procsx; nbr[3] = (my_IDy == 0 ) ? my_ID - Num_procsx + Num_procs : my_ID - Num_procsx; nbr[4] = (my_IDy == Num_procsy-1) ? nbr[0] + Num_procsx - Num_procs : nbr[0] + Num_procsx; nbr[5] = (my_IDy == Num_procsy-1) ? nbr[1] + Num_procsx - Num_procs : nbr[1] + Num_procsx; nbr[6] = (my_IDy == 0 ) ? nbr[0] - Num_procsx + Num_procs : nbr[0] - Num_procsx; nbr[7] = (my_IDy == 0 ) ? 
nbr[1] - Num_procsx + Num_procs : nbr[1] - Num_procsx; grid = initializeGrid(my_tile); switch(particle_mode){ case GEOMETRIC: particles = initializeGeometric(n, L, rho, my_tile, k, m, &particles_count, &particles_size); break; case LINEAR: particles = initializeLinear(n, L, alpha, beta, my_tile, k, m, &particles_count, &particles_size); break; case SINUSOIDAL: particles = initializeSinusoidal(n, L, my_tile, k, m, &particles_count, &particles_size); break; case PATCH: particles = initializePatch(n, L, init_patch, my_tile, k, m, &particles_count, &particles_size); } if (!particles) { printf("ERROR: Rank %d could not allocate space for %llu particles\n", my_ID, particles_size); error=1; } bail_out(error); #if VERBOSE for (i=0; i<Num_procs; i++) { MPI_Barrier(MPI_COMM_WORLD); if (i == my_ID) printf("Rank %d has %llu particles\n", my_ID, particles_count); } #endif if (my_ID==root) { MPI_Reduce(&particles_count, &total_particles, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD); printf("Number of particles placed = %llu\n", total_particles); } else { MPI_Reduce(&particles_count, &total_particles, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD); } /* Allocate space for communication buffers. Adjust appropriately as the simulation proceeds */ uint64_t sendbuf_size[8], recvbuf_size[8]; particle_t *sendbuf[8], *recvbuf[8]; error=0; for (i=0; i<8; i++) { sendbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs)); recvbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs)); sendbuf[i] = (particle_t*) prk_malloc(sendbuf_size[i] * sizeof(particle_t)); recvbuf[i] = (particle_t*) prk_malloc(recvbuf_size[i] * sizeof(particle_t)); if (!sendbuf[i] || !recvbuf[i]) error++; } if (error) printf("Rank %d could not allocate communication buffers\n", my_ID); bail_out(error); /* Run the simulation */ for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_pic_time = wtime(); } ptr_my = 0; for (i=0; i<8; i++) to_send[i]=0; /* Process own particles */ p = particles; for (i=0; i < particles_count; i++) { fx = 0.0; fy = 0.0; computeTotalForce(p[i], my_tile, grid, &fx, &fy); ax = fx * MASS_INV; ay = fy * MASS_INV; /* Update particle positions, taking into account periodic boundaries */ p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L); p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L); /* Update velocities */ p[i].v_x += ax * DT; p[i].v_y += ay * DT; /* Check if particle stayed in same subdomain or moved to another */ owner = find_owner(p[i], width, height, Num_procsx, icrit, jcrit, ileftover, jleftover); if (owner==my_ID) { add_particle_to_buffer(p[i], &p, &ptr_my, &particles_size); /* Add particle to the appropriate communication buffer */ } else if (owner == nbr[0]) { add_particle_to_buffer(p[i], &sendbuf[0], &to_send[0], &sendbuf_size[0]); } else if (owner == nbr[1]) { add_particle_to_buffer(p[i], &sendbuf[1], &to_send[1], &sendbuf_size[1]); } else if (owner == nbr[2]) { add_particle_to_buffer(p[i], &sendbuf[2], &to_send[2], &sendbuf_size[2]); } else if (owner == nbr[3]) { add_particle_to_buffer(p[i], &sendbuf[3], &to_send[3], &sendbuf_size[3]); } else if (owner == nbr[4]) { add_particle_to_buffer(p[i], &sendbuf[4], &to_send[4], &sendbuf_size[4]); } else if (owner == nbr[5]) { add_particle_to_buffer(p[i], &sendbuf[5], &to_send[5], &sendbuf_size[5]); } else if (owner == nbr[6]) { add_particle_to_buffer(p[i], &sendbuf[6], &to_send[6], &sendbuf_size[6]); } else if (owner == nbr[7]) { add_particle_to_buffer(p[i], &sendbuf[7], &to_send[7], 
&sendbuf_size[7]); } else { printf("Could not find neighbor owner of particle %llu in tile %llu\n", i, owner); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } } /* Communicate the number of particles to be sent/received */ for (i=0; i<8; i++) { MPI_Isend(&to_send[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[i]); MPI_Irecv(&to_recv[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]); } MPI_Waitall(16, requests, status); /* Resize receive buffers if need be */ for (i=0; i<8; i++) { resize_buffer(&recvbuf[i], &recvbuf_size[i], to_recv[i]); } /* Communicate the particles */ for (i=0; i<8; i++) { MPI_Isend(sendbuf[i], to_send[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[i]); MPI_Irecv(recvbuf[i], to_recv[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]); } MPI_Waitall(16, requests, status); /* Attach received particles to particles buffer */ for (i=0; i<4; i++) { attach_received_particles(&particles, &ptr_my, &particles_size, recvbuf[2*i], to_recv[2*i], recvbuf[2*i+1], to_recv[2*i+1]); } particles_count = ptr_my; } local_pic_time = MPI_Wtime() - local_pic_time; MPI_Reduce(&local_pic_time, &pic_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* Run the verification test */ /* First verify own particles */ for (i=0; i < particles_count; i++) { correctness += verifyParticle(particles[i], (double)L, iterations); my_checksum += (uint64_t)particles[i].ID; } /* Gather total checksum of particles */ MPI_Reduce(&my_checksum, &tot_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD); /* Gather total checksum of correctness flags */ MPI_Reduce(&correctness, &correctness_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD); if ( my_ID == root) { if (correctness_checksum != total_particles ) { printf("ERROR: there are %llu miscalculated locations\n", total_particles-correctness_checksum); } else { if (tot_checksum != (total_particles*(total_particles+1))/2) { printf("ERROR: Particle checksum incorrect\n"); } else { avg_time = total_particles*iterations/pic_time; printf("Solution validates\n"); printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time); } } } #if VERBOSE for (i=0; i<Num_procs; i++) { MPI_Barrier(MPI_COMM_WORLD); if (i == my_ID) printf("Rank %d has %llu particles\n", my_ID, particles_count); } #endif MPI_Finalize(); return 0; }
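/* A reduced sketch of the two-phase neighbour exchange in the particle loop above:
 * the counts are exchanged first, then the variable-sized payloads once the counts
 * are known. The buffer handling is simplified (the real code also regrows its
 * receive buffers between the phases), and the payload type here is plain doubles
 * rather than the PARTICLE datatype. */
#include <mpi.h>
#include <stdint.h>

#define NB 8  /* number of topological neighbours */

void exchange(double *sendbuf[NB], double *recvbuf[NB],
              uint64_t to_send[NB], uint64_t to_recv[NB],
              const int nbr[NB], MPI_Comm comm)
{
    MPI_Request req[2 * NB];

    /* phase 1: tell each neighbour how many items it will receive */
    for (int i = 0; i < NB; i++) {
        MPI_Isend(&to_send[i], 1, MPI_UINT64_T, nbr[i], 0, comm, &req[i]);
        MPI_Irecv(&to_recv[i], 1, MPI_UINT64_T, nbr[i], 0, comm, &req[NB + i]);
    }
    MPI_Waitall(2 * NB, req, MPI_STATUSES_IGNORE);

    /* phase 2: exchange the payloads with the now-known counts */
    for (int i = 0; i < NB; i++) {
        MPI_Isend(sendbuf[i], (int)to_send[i], MPI_DOUBLE, nbr[i], 1, comm, &req[i]);
        MPI_Irecv(recvbuf[i], (int)to_recv[i], MPI_DOUBLE, nbr[i], 1, comm, &req[NB + i]);
    }
    MPI_Waitall(2 * NB, req, MPI_STATUSES_IGNORE);
}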
/* * This test looks at the handling of char and types that are not required * integers (e.g., long long). MPICH allows * these as well. A strict MPI test should not include this test. * * The rule on max loc is that if there is a tie in the value, the minimum * rank is used (see 4.9.3 in the MPI-1 standard) */ int main( int argc, char *argv[] ) { int errs = 0; int rank, size; MPI_Comm comm; MTest_Init( &argc, &argv ); comm = MPI_COMM_WORLD; MPI_Comm_rank( comm, &rank ); MPI_Comm_size( comm, &size ); /* 2 int */ { struct twoint { int val; int loc; } cinbuf[3], coutbuf[3]; cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; MPI_Reduce( cinbuf, coutbuf, 3, MPI_2INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "2int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "2int MAXLOC(0) test failed, value = %d, should be zero\n", coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "2int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) { errs++; fprintf( stderr, "2int MAXLOC(>) test failed\n" ); } } } /* float int */ { struct floatint { float val; int loc; } cinbuf[3], coutbuf[3]; cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = (float)rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; MPI_Reduce( cinbuf, coutbuf, 3, MPI_FLOAT_INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "float-int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "float-int MAXLOC(0) test failed, value = %f, should be zero\n", coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "float-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) { errs++; fprintf( stderr, "float-int MAXLOC(>) test failed\n" ); } } } /* long int */ { struct longint { long val; int loc; } cinbuf[3], coutbuf[3]; cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; MPI_Reduce( cinbuf, coutbuf, 3, MPI_LONG_INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "long-int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "long-int MAXLOC(0) test failed, value = %ld, should be zero\n", coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "long-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) { errs++; fprintf( stderr, "long-int MAXLOC(>) test failed\n" ); } } } /* short int */ { struct shortint { short val; int loc; } cinbuf[3], coutbuf[3]; cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; 
coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; MPI_Reduce( cinbuf, coutbuf, 3, MPI_SHORT_INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "short-int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "short-int MAXLOC(0) test failed, value = %d, should be zero\n", coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "short-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1) { errs++; fprintf( stderr, "short-int MAXLOC(>) test failed, value = %d, should be %d\n", coutbuf[2].val, size-1 ); } if (coutbuf[2].loc != size -1) { errs++; fprintf( stderr, "short-int MAXLOC(>) test failed, location of max = %d, should be %d\n", coutbuf[2].loc, size-1 ); } } } /* double int */ { struct doubleint { double val; int loc; } cinbuf[3], coutbuf[3]; cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; MPI_Reduce( cinbuf, coutbuf, 3, MPI_DOUBLE_INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "double-int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "double-int MAXLOC(0) test failed, value = %lf, should be zero\n", coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "double-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1 || coutbuf[2].loc != size-1) { errs++; fprintf( stderr, "double-int MAXLOC(>) test failed\n" ); } } } #ifdef HAVE_LONG_DOUBLE /* long double int */ { struct longdoubleint { long double val; int loc; } cinbuf[3], coutbuf[3]; /* avoid valgrind warnings about padding bytes in the long double */ memset(&cinbuf[0], 0, sizeof(cinbuf)); memset(&coutbuf[0], 0, sizeof(coutbuf)); cinbuf[0].val = 1; cinbuf[0].loc = rank; cinbuf[1].val = 0; cinbuf[1].loc = rank; cinbuf[2].val = rank; cinbuf[2].loc = rank; coutbuf[0].val = 0; coutbuf[0].loc = -1; coutbuf[1].val = 1; coutbuf[1].loc = -1; coutbuf[2].val = 1; coutbuf[2].loc = -1; if (MPI_LONG_DOUBLE != MPI_DATATYPE_NULL) { MPI_Reduce( cinbuf, coutbuf, 3, MPI_LONG_DOUBLE_INT, MPI_MAXLOC, 0, comm ); if (rank == 0) { if (coutbuf[0].val != 1 || coutbuf[0].loc != 0) { errs++; fprintf( stderr, "long double-int MAXLOC(1) test failed\n" ); } if (coutbuf[1].val != 0) { errs++; fprintf( stderr, "long double-int MAXLOC(0) test failed, value = %lf, should be zero\n", (double)coutbuf[1].val ); } if (coutbuf[1].loc != 0) { errs++; fprintf( stderr, "long double-int MAXLOC(0) test failed, location of max = %d, should be zero\n", coutbuf[1].loc ); } if (coutbuf[2].val != size-1) { errs++; fprintf( stderr, "long double-int MAXLOC(>) test failed, value = %lf, should be %d\n", (double)coutbuf[2].val, size-1 ); } if (coutbuf[2].loc != size-1) { errs++; fprintf( stderr, "long double-int MAXLOC(>) test failed, location of max = %d, should be %d\n", coutbuf[2].loc, size-1 ); } } } } #endif MTest_Finalize( errs ); MPI_Finalize(); return 0; }
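/* A short sketch of the MPI_MAXLOC idiom the tests above exercise: each rank
 * contributes a {value, rank} pair and rank 0 learns both the global maximum and
 * which rank owns it (ties resolved toward the lowest rank, per the standard). */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    struct { double val; int loc; } in, out;  /* layout matching MPI_DOUBLE_INT */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    in.val = (rank % 3) * 1.5;   /* arbitrary per-rank value */
    in.loc = rank;               /* tag the value with its owner */
    MPI_Reduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("max value %g held by rank %d\n", out.val, out.loc);
    MPI_Finalize();
    return 0;
}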
/*------------------------------------------------*/ int main (int argc, char **argv) { int cols, rows, iter, particles, x, y; int *pic; PartStr *p, *changes, *totalChanges; int rank, num, i, numChanges, numTotalChanges; int *changesPerNode, *buffDispl; MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Comm_size (MPI_COMM_WORLD, &num); if (argc < 2) // use default values if user does not specify anything { cols = PIC_SIZE + 2; rows = PIC_SIZE + 2; iter = MAX_ITER; particles = PARTICLES; } else { cols = atoi (argv[1]) + 2; rows = atoi (argv[2]) + 2; particles = atoi (argv[3]); iter = atoi (argv[4]); } // initialize the random number generator srand(rank); // srand(time(0)); // this should be used instead if the program runs on multiple hosts int particlesPerNode = particles / num; if (rank == num - 1) particlesPerNode = particles - particlesPerNode * (num - 1); // in case particles cannot be split evenly // printf("%i has %i\n", rank, particlesPerNode); pic = (int *) malloc (sizeof (int) * cols * rows); p = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode); changes = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode); totalChanges = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode); changesPerNode = (int *) malloc (sizeof (int) * num); buffDispl = (int *) malloc (sizeof (int) * num); assert (pic != 0 && p != 0 && changes != 0 && totalChanges != 0 && changesPerNode != 0); // MPI user type declaration int lengths[2] = { 1, 1 }; MPI_Datatype types[2] = { MPI_INT, MPI_INT }; MPI_Aint add1, add2; MPI_Aint displ[2]; MPI_Datatype Point; MPI_Address (p, &add1); MPI_Address (&(p[0].y), &add2); displ[0] = 0; displ[1] = add2 - add1; MPI_Type_struct (2, lengths, displ, types, &Point); MPI_Type_commit (&Point); dla_init_plist (pic, rows, cols, p, particlesPerNode, 1); while (--iter) { dla_evolve_plist (pic, rows, cols, p, &particlesPerNode, changes, &numChanges); // printf("%i changed %i on iter %i : ",rank, numChanges, iter); // for(i=0;i<numChanges;i++) printf("(%i, %i) ", changes[i].x, changes[i].y); // printf("\n"); //exchange information with other nodes MPI_Allgather (&numChanges, 1, MPI_INT, changesPerNode, 1, MPI_INT, MPI_COMM_WORLD); //calculate offsets numTotalChanges = 0; for (i = 0; i < num; i++) { buffDispl[i] = numTotalChanges; numTotalChanges += changesPerNode[i]; } // if(rank==0) // { // for(i=0;i<num;i++) // printf("%i tries to send %i\n",i,changesPerNode[i]); // printf("-----------\n"); // } if(numTotalChanges >0) { MPI_Allgatherv (changes, numChanges, Point, totalChanges, changesPerNode, buffDispl, Point, MPI_COMM_WORLD); apply_changes (pic, rows, cols, totalChanges, numTotalChanges); // if(rank==0) // { // printf("Total changes %i : ", numTotalChanges); // for(i=0;i<numTotalChanges;i++) printf("(%i, %i) ", totalChanges[i].x, totalChanges[i].y); // // printf("\n"); // printf("-----------\n"); // } } } /* Print to stdout a PBM picture of the simulation space */ if (rank == 0) { printf ("P1\n%i %i\n", cols - 2, rows - 2); for (y = 1; y < rows - 1; y++) { for (x = 1; x < cols - 1; x++) { if (pic[y * cols + x] < 0) printf ("1 "); else printf ("0 "); } printf ("\n"); } } MPI_Reduce(&particlesPerNode, &particles, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if(rank==0) fprintf(stderr, "Remaining particles %i\n", particles); free (pic); free (p); free (changes); free (changesPerNode); free (buffDispl); MPI_Finalize (); return 0; }
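/* MPI_Address and MPI_Type_struct, used above, were deprecated in MPI-2 and removed
 * in MPI-3. A sketch of the equivalent construction with the current interface,
 * assuming PartStr holds two ints x and y as the code above suggests: */
#include <mpi.h>
#include <stddef.h>

typedef struct { int x; int y; } PartStr;   /* assumed layout */

static MPI_Datatype make_point_type(void)
{
    PartStr probe;
    int lengths[2] = { 1, 1 };
    MPI_Datatype types[2] = { MPI_INT, MPI_INT };
    MPI_Aint base, addr_y, displ[2];
    MPI_Datatype point;

    MPI_Get_address(&probe, &base);
    MPI_Get_address(&probe.y, &addr_y);
    displ[0] = 0;
    displ[1] = addr_y - base;   /* or simply offsetof(PartStr, y) */

    MPI_Type_create_struct(2, lengths, displ, types, &point);
    MPI_Type_commit(&point);
    return point;               /* caller frees it with MPI_Type_free() */
}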
void PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) { double *utime = stat->utime; flops_t *ops = stat->ops; int_t iam = grid->iam; flops_t flopcnt, factflop, solveflop; if ( options->PrintStat == NO ) return; if ( !iam && options->Fact != FACTORED ) { printf("**************************************************\n"); printf("**** Time (seconds) ****\n"); if ( options->Equil != NO ) printf("\tEQUIL time %8.2f\n", utime[EQUIL]); if ( options->RowPerm != NOROWPERM ) printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); if ( options->ColPerm != NATURAL ) printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); } MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); factflop = flopcnt; if ( !iam && options->Fact != FACTORED ) { printf("\tFACTOR time %8.2f\n", utime[FACT]); if ( utime[FACT] != 0.0 ) printf("\tFactor flops\t%e\tMflops \t%8.2f\n", flopcnt, flopcnt*1e-6/utime[FACT]); } MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); solveflop = flopcnt; if ( !iam ) { printf("\tSOLVE time %8.2f\n", utime[SOLVE]); if ( utime[SOLVE] != 0.0 ) printf("\tSolve flops\t%e\tMflops \t%8.2f\n", flopcnt, flopcnt*1e-6/utime[SOLVE]); if ( options->IterRefine != NOREFINE ) { printf("\tREFINEMENT time %8.2f\tSteps%8d\n\n", utime[REFINE], stat->RefineSteps); } printf("**************************************************\n"); } #if ( PROFlevel>=1 ) fflush(stdout); MPI_Barrier( grid->comm ); { int_t i, P = grid->nprow*grid->npcol; flops_t b, maxflop; if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); for (i = 0; i < P; ++i) { if ( iam == i) { printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]); fflush(stdout); } MPI_Barrier( grid->comm ); } if ( !iam ) printf("\n.. FACT ops distribution:\n"); for (i = 0; i < P; ++i) { if ( iam == i ) { printf("\t\t(%d)\t%e\n", iam, ops[FACT]); fflush(stdout); } MPI_Barrier( grid->comm ); } MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); if ( !iam ) { b = factflop/P/maxflop; printf("\tFACT load balance: %.2f\n", b); } if ( !iam ) printf("\n.. SOLVE ops distribution:\n"); for (i = 0; i < P; ++i) { if ( iam == i ) { printf("\t\t%d\t%e\n", iam, ops[SOLVE]); fflush(stdout); } MPI_Barrier( grid->comm ); } MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); if ( !iam ) { b = solveflop/P/maxflop; printf("\tSOLVE load balance: %.2f\n", b); } } #endif /* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ }
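/* A minimal sketch of the load-balance figure printed in the PROFlevel block above:
 * the ratio of the average per-rank flop count to the maximum one, so 1.0 means a
 * perfectly balanced factorization. 'local_flops' stands for whatever each rank counted. */
#include <mpi.h>
#include <stdio.h>

void print_load_balance(float local_flops, MPI_Comm comm)
{
    float sum = 0.0f, max = 0.0f;
    int rank, nprocs;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPI_Reduce(&local_flops, &sum, 1, MPI_FLOAT, MPI_SUM, 0, comm);
    MPI_Reduce(&local_flops, &max, 1, MPI_FLOAT, MPI_MAX, 0, comm);
    if (rank == 0 && max > 0.0f)
        printf("\tload balance: %.2f\n", (sum / nprocs) / max);
}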