void Fill_Dist( void ) { FILE* inptr = fopen(input_file, "r"); if (mype == 0) { fscanf(inptr,"%d", &NumCities); printf("Number of cities: %d\n", NumCities); for( int i = 0 ; i<NumCities ; i++ ) { for( int j = 0 ; j<NumCities ; j++ ) { fscanf(inptr,"%d", &Dist[i*NumCities + j]); printf("%5d", Dist[i*NumCities+j] ); } printf("\n"); } } // Defining pSnyc array for collective operations for (int i=0; i < _SHMEM_BCAST_SYNC_SIZE; i++) pSync[i] = _SHMEM_SYNC_VALUE; // global operation, all processes must call it shmem_barrier_all(); shmem_broadcast32(&NumCities, &NumCities, 1, 0, 0, 0, NumProcs, pSync); assert(NumCities<=MAXCITIES); if(NumCities*NumCities > DIST_MAX_SIZE) fprintf(stderr, "Increase size of Dist array\n"); shmem_barrier_all(); shmem_broadcast32(Dist, Dist, NumCities*NumCities, 0, 0, 0, NumProcs, pSync); }
/* Performance test for shmem_broadcast32 */ #include <stdio.h> #include <stdlib.h> #include <time.h> #include <sys/time.h> #include <shmem.h> long pSyncA[_SHMEM_BCAST_SYNC_SIZE]; long pSyncB[_SHMEM_BCAST_SYNC_SIZE]; #define N_ELEMENTS 25600/*Data size chosen to be able to capture time required*/ int main(void) { int i,j,k; int *target; int *source; int me, npes; struct timeval start, end; long time_taken,start_time,end_time; start_pes(0); me = _my_pe(); npes = _num_pes(); source = (int *) shmalloc( N_ELEMENTS * sizeof(*source) ); time_taken = 0; for (i = 0; i < N_ELEMENTS; i += 1) { source[i] = i + 1; } target = (int *) shmalloc( N_ELEMENTS * sizeof(*target) ); for (i = 0; i < N_ELEMENTS; i += 1) { target[i] = -90; } for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1) { pSyncA[i] = _SHMEM_SYNC_VALUE; pSyncB[i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); for(i=0;i<10000;i++){ gettimeofday(&start, NULL); start_time = (start.tv_sec * 1000000.0) + start.tv_usec; /* alternate between 2 pSync arrays to synchronize * consequent collectives of even and odd iterations */ if(i % 2) shmem_broadcast32(target, source, N_ELEMENTS, 0, 0, 0, npes, pSyncA); else shmem_broadcast32(target, source, N_ELEMENTS, 0, 0, 0, npes, pSyncB); gettimeofday(&end, NULL); end_time = (end.tv_sec * 1000000.0) + end.tv_usec; if(me==0){ time_taken = time_taken + (end_time - start_time); } } if(me == 0) printf("Time required for a broadcast of 100 Kbytes of data, with %d PEs is %ld microseconds\n",npes,time_taken/10000); shmem_barrier_all(); shfree(target); shfree(source); return 0; }
int osh_coll_tc3(const TE_NODE *node, int argc, const char *argv[]) { /* General initialisations */ int ii, numprocs, master; static int32_t source[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; static int32_t target[10]; int nlong; long *pSync = 0; int status = TC_PASS; UNREFERENCED_PARAMETER(node); UNREFERENCED_PARAMETER(argc); UNREFERENCED_PARAMETER(argv); numprocs = _num_pes(); master = 1; nlong = 10; if (numprocs == 1) { log_debug(OSH_TC, "Using more than 1 CPU makes the tests of this program more interesting\n"); return TC_SETUP_FAIL; } for (ii = 0; ii < nlong; ii++) target[ii] = 0; pSync = NULL; pSync = shmalloc(sizeof(long) *_SHMEM_COLLECT_SYNC_SIZE); for (ii=0; ii < _SHMEM_COLLECT_SYNC_SIZE; ii++) { pSync[ii] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); /* Wait for all CPUs to initialize pSync */ /* Broadcast function */ if (_my_pe()%2 == 1) shmem_broadcast32(target, source, nlong, 0, 1, 1, numprocs/2, pSync); /* local master CPU = 0 */ if ((_my_pe()%2 == 1) && (_my_pe() != master)) for (ii = 0; ii < nlong; ii++) if (target[ii] != (ii + 1)) status = TC_FAIL; if (pSync) { shfree(pSync); } return status; }
void FORTRANIFY (shmem_broadcast32) (void *target, const void *source, int *nelems, int *PE_root, int *PE_start, int *logPE_stride, int *PE_size, int *pSync) { shmem_broadcast32 (target, source, *nelems, *PE_root, *PE_start, *logPE_stride, *PE_size, (long *) pSync); }
/*
 * bcast - time `loops` back-to-back shmem_broadcast32 calls.
 *
 * target, src : symmetric buffers holding at least `elements` 32-bit words
 * elements    : number of words per broadcast
 * me, npes    : calling PE number and size of the active set (PEs 0..npes-1)
 * loops       : number of broadcasts to time
 *
 * PE 0 is the root.  Two pSync arrays, carved from one symmetric
 * allocation, are alternated between consecutive broadcasts.  When Verbose
 * is set, PE 0 prints the elapsed time, the per-call time and the bandwidth.
 *
 * Collective: every PE must call this with the same arguments.  If the
 * pSync allocation fails the function reports it and returns without
 * timing anything (the allocation is symmetric, so it is expected to fail
 * on every PE alike).
 */
void bcast(int *target, int *src, int elements, int me, int npes, int loops)
{
    int i;
    double start_time, elapsed_time;
    long *ps, *pSync, *pSync1;
    /* Widen before multiplying: loops * elements can overflow an int. */
    const long msg_bytes = (long)elements * (long)sizeof(*src);
    const long total_bytes = (long)loops * msg_bytes;

    pSync = (long*)shmem_malloc( 2 * sizeof(long) * _SHMEM_BCAST_SYNC_SIZE );
    if (pSync == NULL) {
        fprintf(stderr, "%s: PE %d: shmem_malloc() of pSync failed\n",
                __func__, me);
        return;
    }
    pSync1 = &pSync[_SHMEM_BCAST_SYNC_SIZE];
    for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i++) {
        pSync[i] = pSync1[i] = _SHMEM_SYNC_VALUE;
    }

    if (me==0 && Verbose) {
        /* msg_bytes is a long, matching the %ld conversion. */
        fprintf(stdout, "%s: %d loops of broadcast32(%ld bytes) over %d PEs: ",
                __func__, loops, msg_bytes, npes);
        fflush(stdout);
    }

    /* Also makes the pSync initialisation visible before the first use. */
    shmem_barrier_all();

    start_time = shmemx_wtime();
    for(i = 0; i < loops; i++) {
        ps = (i & 1) ? pSync1 : pSync;
        shmem_broadcast32( target, src, elements, 0, 0, 0, npes, ps );
    }
    elapsed_time = shmemx_wtime() - start_time;

    if (me==0 && Verbose) {
        printf("%7.3f secs\n", elapsed_time);
        printf("  %7.5f usecs / broadcast32(), %ld Kbytes @ %7.4f MB/sec\n\n",
               (elapsed_time/((double)loops*npes))*1000000.0,
               (total_bytes/1024),
               ((double)total_bytes/(1024.0*1024.0)) / elapsed_time );
    }

    shmem_barrier_all();
    shmem_free( pSync );
}
int main(int argc, char **argv) { int i,j; int my_pe,n_pes,PE_root; size_t max_elements,max_elements_bytes; int *srce_int,*targ_int,ans_int; long *srce_long,*targ_long,ans_long; float *srce_float,*targ_float,ans_float; double *srce_double,*targ_double,ans_double; shmem_init(); my_pe = shmem_my_pe(); n_pes = shmem_n_pes(); /* fail if trying to use only one processor */ if ( n_pes <= 1 ){ fprintf(stderr, "FAIL - test requires at least two PEs\n"); exit(1); } if(my_pe == 0) fprintf(stderr, "shmem_broadcast(%s) n_pes=%d\n", argv[0],n_pes); /* initialize the pSync arrays */ for (i=0; i < _SHMEM_BCAST_SYNC_SIZE; i++) { pSync1[i] = _SHMEM_SYNC_VALUE; pSync2[i] = _SHMEM_SYNC_VALUE; } shmem_barrier_all(); /* Wait for all PEs to initialize pSync1 & pSync2 */ PE_root=1; /* we'll broadcast from this PE */ /* shmem_broadcast32 test */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_broadcast32 max_elements = %d\n", max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); srce_float = shmem_malloc(max_elements_bytes); targ_float = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL) || (srce_float == NULL) || (targ_float == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+j); srce_float[j] = (float)(my_pe+j); targ_int[j] = (int)(100*my_pe+j); targ_float[j] = (float)(100*my_pe+j); } shmem_barrier_all(); for(i = 0; i < IMAX; i+=2) { /* i is even -- using int */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+i+j); } /* broadcast from PE_root to all PEs using pSync1 */ shmem_broadcast32(targ_int,srce_int,max_elements,PE_root,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_int= (int)(100*my_pe+j); } else { ans_int= (int)(PE_root+i+j); } if ( targ_int[j] != ans_int ) fprintf(stderr, "FAIL: PE [%d] 
targ_int[%d]=%d ans_int=%d\n", my_pe,j,targ_int[j],ans_int); } /* i+1 is odd -- using float */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_float[j] = (float)(PE_root+i+1+j); } /* broadcast from PE_root to all PEs using pSync2 */ shmem_broadcast32(targ_float,srce_float,max_elements,PE_root,0,0,n_pes,pSync2); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_float= (float)(100*my_pe+j); } else { ans_float= (float)(PE_root+i+1+j); } if ( targ_float[j] != ans_float ) fprintf(stderr, "FAIL: PE [%d] targ_float[%d]=%10.0f ans_float=%10.0f\n", my_pe,j,targ_float[j],ans_float); } } shmem_free(srce_int); shmem_free(targ_int); shmem_free(srce_float); shmem_free(targ_float); /* shmem_broadcast64 test */ max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmem_broadcast64 max_elements = %d\n", max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); srce_double = shmem_malloc(max_elements_bytes); targ_double = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL) || (srce_double == NULL) || (targ_double == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+j); srce_double[j] = (double)(my_pe+j); targ_long[j] = (long)(100*my_pe+j); targ_double[j] = (double)(100*my_pe+j); } shmem_barrier_all(); for(i = 0; i < IMAX; i+=2) { /* i is even -- using long */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+i+j); } /* broadcast from PE_root to all PEs using pSync1 */ shmem_broadcast64(targ_long,srce_long,max_elements,PE_root,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_long= (long)(100*my_pe+j); } else { ans_long= (long)(PE_root+i+j); } if ( targ_long[j] != ans_long ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d ans_long=%d\n", my_pe,j,targ_long[j],ans_long); } /* i+1 is odd -- 
using double */ if (my_pe == PE_root) for(j = 0; j < max_elements; j++) { srce_double[j] = (double)(PE_root+i+1+j); } /* broadcast from PE_root to all PEs using pSync2 */ shmem_broadcast64(targ_double,srce_double,max_elements,PE_root,0,0,n_pes,pSync2); for(j = 0; j < max_elements; j++) { if (my_pe == PE_root) { ans_double= (double)(100*my_pe+j); } else { ans_double= (double)(PE_root+i+1+j); } if ( targ_double[j] != ans_double ) fprintf(stderr, "FAIL: PE [%d] targ_double[%d]=%10.0f ans_double=%10.0f\n", my_pe,j,targ_double[j],ans_double); } } shmem_free(srce_long); shmem_free(targ_long); shmem_free(srce_double); shmem_free(targ_double); #ifndef OPENSHMEM #ifdef SHMEM_C_GENERIC_32 /* shmemx_broadcast (GENERIC 32) test */ max_elements = (size_t) (MAX_SIZE / sizeof(int)); max_elements_bytes = (size_t) (sizeof(int)*max_elements); if(my_pe == 0) fprintf(stderr,"shmemx_broadcast (GENERIC 32) max_elements = %d\n", max_elements); srce_int = shmem_malloc(max_elements_bytes); targ_int = shmem_malloc(max_elements_bytes); if((srce_int == NULL) || (targ_int == NULL)) shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_int[j] = (int)(my_pe+j); targ_int[j] = (int)(2*my_pe+j); } shmem_barrier_all(); /* broadcast from PE 1 to all PEs */ shmemx_broadcast(targ_int,srce_int,max_elements,1,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == 1) { ans_int= (int)(j+2); } else { ans_int= (int)(j+1); } if ( targ_int[j] != ans_int ) fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d ans_int=%d\n", my_pe,j,targ_int[j],ans_int); } shmem_free(srce_int); shmem_free(targ_int); #else /* shmemx_broadcast (GENERIC 64) test */ max_elements = (size_t) (MAX_SIZE / sizeof(long)); max_elements_bytes = (size_t) (sizeof(long)*max_elements); if(my_pe == 0) fprintf(stderr,"shmemx_broadcast (GENERIC 64) max_elements = %d\n", max_elements); srce_long = shmem_malloc(max_elements_bytes); targ_long = shmem_malloc(max_elements_bytes); if((srce_long == NULL) || (targ_long == NULL)) 
shmalloc_error(); for(j = 0; j < max_elements; j++) { srce_long[j] = (long)(my_pe+j); targ_long[j] = (long)(2*my_pe+j); } shmem_barrier_all(); /* broadcast from PE 1 to all PEs */ shmemx_broadcast(targ_long,srce_long,max_elements,1,0,0,n_pes,pSync1); for(j = 0; j < max_elements; j++) { if (my_pe == 1) { ans_long = (long)(j+2); } else { ans_long = (long)(j+1); } if ( targ_long[j] != ans_long ) fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d ans_long=%d\n", my_pe,j,targ_long[j],ans_long); } shmem_free(srce_long); shmem_free(targ_long); #endif #endif #ifdef NEEDS_FINALIZE shmem_finalize(); #endif return 0; }
/*
 * Entry point of the Wang-Landau / LSMS driver (banner "LSMS_3").
 *
 * Command line options parsed below: -num_lsms, -size_lsms, -align,
 * -num_steps, -initial_steps, -walltime (value multiplied by 60.0 into
 * max_time), -i, -random_dir, -step_out, -wl_out, -wl_in, -mode
 * (constant|random|1d|ising|2d|2d-m|2d-x|2d-y|2d-z) and
 * -energy_calculation (selected by first letter: o, m or s).
 *
 * With num_lsms == 1 a single LSMS object spanning all PEs is built and one
 * energy is printed.  Otherwise the PEs from `align` upward are split into
 * num_lsms groups of comm_size = (size-align)/num_lsms PEs; world_rank 0
 * runs the master loop (creates an EvecGenerator, hands evecs to the group
 * roots, collects energies), while group members loop on an opcode
 * broadcast inside their group (5 = calculate energy, 10 = report number of
 * spins, anything else = finish).
 *
 * The function is a partially completed port from MPI to OpenSHMEM: many
 * MPI calls are commented out next to their shmem replacement, others are
 * still live.
 *
 * NOTE(review), all visible in the code below:
 *  - evec and r_values are shmalloc'ed from size_lsms before size_lsms has
 *    been assigned (it is set to -1 and parsed from argv only afterwards).
 *  - The multi-instance branch declares a second, uninitialised
 *    `int world_rank;` that is then used to pick the group.
 *  - The main sampling loop still reads status.MPI_SOURCE and calls
 *    MPI_Send although the matching MPI_Recv is commented out; the
 *    initial-steps loop uses `flag` instead.
 *  - evecs[i] is obtained from shmalloc but released with free().
 *  - `if(gWL_in_name)` / `if(gWL_out_name)` test arrays and are always true.
 *  - Option values are taken from argv[++i] without checking i against argc,
 *    and copied with strncpy(..., 64) into 64-byte buffers.
 *  - pSync1, pSync2, pWrk_i and `comm` are not declared in this function -
 *    presumably file-scope objects; confirm they exist and are initialised.
 */
int main(int argc, char *argv[]) { int size, rank, world_rank, my_group; int num_lsms; // number of parallel LSMS instances int size_lsms; // number of atoms in a lsms instance int num_steps; // number of energy calculations int initial_steps; // number of steps before sampling starts int stepCount=0; // count the Monte Carlo steps executed double max_time; // maximum walltime for this run in seconds bool restrict_time = false; // was the maximum time specified? bool restrict_steps = false; // or the max. numer of steps? int align; // alignment of lsms_instances double magnetization; double energy_accumulator; // accumulates the enegy to calculate the mean int energies_accumulated; int new_peid,new_root; static int op,flag; double *evec,*r_values; evec=(double *)shmalloc(sizeof(double)*3*size_lsms); r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1))); energy_accumulator=0.0; energies_accumulated=0; double walltime_0,walltime; double restartWriteFrequency=30.0*60.0; double nextWriteTime=restartWriteFrequency; MPI_Comm local_comm; int *lsms_rank0; MPI_Status status; char prefix[40]; char i_lsms_name[64]; char gWL_in_name[64], gWL_out_name[64]; char mode_name[64]; char energy_calculation_name[64]; char stupid[37]; char step_out_name[64]; char wl_step_out_name[128]; char *wl_stepf=NULL; bool step_out_flag=false; std::ofstream step_out_file; typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode; typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension; EvecGenerationMode evec_generation_mode = Constant; SecondDimension second_dimension = MagneticMoment; double ev0[3]; bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step. 
bool generator_needs_moment=false; typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode; EnergyCalculationMode energyCalculationMode = OneStepEnergy; int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy) ev0[0]=ev0[1]=0.0; ev0[2]=1.0; // size has to be align + size_lsms*num_lsms align=1; num_lsms=1; size_lsms=-1; my_group=-1; num_steps=1; initial_steps=0; sprintf(i_lsms_name,"i_lsms"); gWL_in_name[0]=gWL_out_name[0]=0; mode_name[0]=0; energy_calculation_name[0]=0; // check command line arguments for(int i=0; i<argc; i++) { if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]); if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]); if(!strcmp("-align",argv[i])) align=atoi(argv[++i]); if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;} if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;} if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64); if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;} if(!strcmp("-step_out",argv[i])) {strncpy(step_out_name,argv[++i],64); step_out_flag=true; return_moments_flag=true;} if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64); if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64); if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64); if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64); } if(!(restrict_steps || restrict_time)) restrict_steps=true; if(mode_name[0]!=0) { if(!strcmp("constant",mode_name)) evec_generation_mode = Constant; if(!strcmp("random",mode_name)) evec_generation_mode = Random; if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d; if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing; if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d; if(!strcmp("2d-m",mode_name)) 
{evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;} if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;} if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;} if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;} } if(energy_calculation_name[0]!=0) { if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; } if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; } if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; } } #ifdef USE_PAPI #define NUM_PAPI_EVENTS 4 int hw_counters = PAPI_num_counters(); if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS; int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS}; char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS", "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE", "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"}; // "RETIRED_INSTRUCTIONS", // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2", // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1", // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1" // get events from names: for(int i=0; i<NUM_PAPI_EVENTS; i++) { if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK) { // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]), // std::cerr,parameters.myrankWorld); // printline("Skipping all following events", // std::cerr,parameters.myrankWorld); if(hw_counters>i) hw_counters=i; } } long long papi_values[NUM_PAPI_EVENTS+4]; // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld); if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS; long 
long papi_real_cyc_0 = PAPI_get_real_cyc(); long long papi_real_usec_0 = PAPI_get_real_usec(); long long papi_virt_cyc_0 = PAPI_get_virt_cyc(); long long papi_virt_usec_0 = PAPI_get_virt_usec(); PAPI_start_counters(papi_events,hw_counters); #endif lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1)); // initialize MPI: MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); world_rank=rank; MPI_Comm_size(MPI_COMM_WORLD, &size); walltime_0 = get_rtc(); #ifndef SVN_REV #define SVN_REV "unknown" #endif // make sure 'return_moments_flag' is set correctly switch(evec_generation_mode) { case Constant : break; case Random : break; case WangLandau_1d : return_moments_flag = true; generator_needs_moment = true; break; case ExhaustiveIsing : break; case WangLandau_2d : return_moments_flag = true; generator_needs_moment = true; break; default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } if(rank==0) { std::cout<<"LSMS_3"<<std::endl; std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl; #ifdef USE_PAPI std::cout<<" Using Papi counters"<<std::endl<<std::endl; #endif std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n"; std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl; std::cout<<" LSMS Energy calculated using "; switch(energyCalculationMode) { case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break; case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break; case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break; default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1); } if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl; if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n"; std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl; switch(evec_generation_mode) { case Constant : std::cout<<" 
Constant moments direction along " <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl; break; case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl; break; case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl; // return_moments_flag = true; // generator_needs_moment = true; break; case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break; case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, "; switch(second_dimension) { case MagneticMoment : std::cout<<"magnitude of magnetization)"; break; case MagneticMomentX : std::cout<<"x component of magnetization)"; break; case MagneticMomentY : std::cout<<"y component of magnetization)"; break; case MagneticMomentZ : std::cout<<"z component of magnetization)"; break; } std::cout<<std::endl; // return_moments_flag = true; // generator_needs_moment = true; break; default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl; std::cout<<std::endl; if(step_out_flag && (evec_generation_mode==WangLandau_1d)) { // step_out_flag=false; snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name); wl_stepf=wl_step_out_name; } if(step_out_flag) { step_out_file.open(step_out_name); step_out_file<<"#"; for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i]; step_out_file<<std::endl<<size_lsms<<std::endl; } } if(generator_needs_moment) return_moments_flag=true; if(num_lsms==1) { SHMEM_activeset local_comm; local_comm.rank=shmem_my_pe(); local_comm.size=shmem_n_pes(); local_comm.start_pe=0; local_comm.logPE_stride=0; LSMS lsms_calc(local_comm,i_lsms_name,"1_"); if(rank==0) { std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n"; std::cout<<" LSMS version = "<<lsms_calc.version()<<std::endl; } if(energyCalculationMode==OneStepEnergy) std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl; else 
if(energyCalculationMode==MultiStepEnergy) std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl; else if(energyCalculationMode==ScfEnergy) std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl; else { printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n"); // MPI_Abort(MPI_COMM_WORLD,5); exit(5); } } else { // build the communicators //int color=MPI_UNDEFINED; //Assuming user passes a power of two while using "-align" int s = align; int comm_size=(size-align)/num_lsms; int world_rank; for(int i=0; i<num_lsms; i++) { if((world_rank>=s) && (world_rank<s+comm_size)) { my_group=i; //color=i; new_peid=world_rank-s; new_root=s; } lsms_rank0[i]=s; s+=comm_size; } if(world_rank==0){ //color=num_lsms; new_peid=0; comm_size=1; new_root=0; } //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm); SHMEM_activeset local_comm; local_comm.rank=new_peid; local_comm.size=comm_size; local_comm.start_pe=new_root; local_comm.logPE_stride=0; std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl; snprintf(prefix,38,"Group %4d: ",my_group); // now we get ready to do some calculations... 
if(my_group>=0) { double energy; double band_energy; int static i_values[10]; double static r_values[10]; static int op; //MPI_Comm_rank(local_comm, &rank); rank = local_comm.rank; snprintf(prefix,38,"%d_",my_group); // to use the ramdisk on jaguarpf: // snprintf(prefix,38,"/tmp/ompi/%d_",my_group); LSMS lsms_calc(local_comm,i_lsms_name,prefix); snprintf(prefix,38,"Group %4d: ",my_group); if(rank==0 && my_group==0) { std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n"; std::cout<<prefix<<" LSMS version = "<<lsms_calc.version()<<std::endl; } // wait for commands from master bool finished=false; while(!finished) { if(rank==0) { //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status); //op =status.MPI_TAG; if (lsms_rank0[0]==world_rank) shmem_barrier(0, lsms_rank0[0], 2, pSync1); } //MPI_Bcast(&op,1,MPI_INT,0,local_comm); shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); /* recognized opcodes: 5: calculate energy recognized energy calculation modes: OneStepEnergy : calclulate frozen potential band energy in one step (don't converge Ef) use only if the Fermi energy will not change due to MC steps! The only method available in LSMS_1.9 MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy This should be the new default method. If the Fermi energy doesn't change multiStepEnergy only performs one step and should be equivalent to oneStepEnergy The tolerance for Ef convergence can be set with LSMS::setEfTol(Real). The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6). The maximum number of steps is read from the LSMS input file 'nscf' parameter. ScfEnergy : this will calculate the selfconsistent total energy. The maximum number of steps is read from the LSMS input file 'nscf' parameter. NOT IMPLEMENTED YET!!! 
10: get number of sites */ if(op==5) { lsms_calc.setEvec(evec); if(energyCalculationMode==OneStepEnergy) energy=lsms_calc.oneStepEnergy(&band_energy); else if(energyCalculationMode==MultiStepEnergy) band_energy=energy=lsms_calc.multiStepEnergy(); else if(energyCalculationMode==ScfEnergy) energy=lsms_calc.scfEnergy(&band_energy); else { printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n"); //MPI_Abort(MPI_COMM_WORLD,5); exit(5); } r_values[0]=energy; r_values[1]=band_energy; if(return_moments_flag) { lsms_calc.getMag(&r_values[R_VALUE_OFFSET]); } if(rank==0) { if(return_moments_flag) { //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD); shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0); } else { //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD); shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0); } shmem_fence(); shmem_int_swap(&flag, world_rank, 0); } } else if(op==10) { i_values[0]=lsms_calc.numSpins(); //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD); shmem_int_put(i_values, i_values, 10, 0); } else { // printf("world rank %d: recieved exit\n",world_rank); finished=true; } } shfree(evec); //shfree(r_values); } else if(world_rank==0) { int running; double **evecs; //double *r_values; //int i_values[10]; int *init_steps; int total_init_steps; bool accepted; char *wl_inf=NULL; char *wl_outf=NULL; if(gWL_in_name) wl_inf=gWL_in_name; if(gWL_out_name) wl_outf=gWL_out_name; EvecGenerator *generator; /* // get number of spins from first LSMS instance // temp r_values: r_values=(double *)malloc(sizeof(double)*10); MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD); free(r_values); MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status); if(i_values[0]!=size_lsms) { printf("Size specified for Wang-Landau and in LSMS input file don't match!\n"); size_lsms=i_values[0]; } */ evecs=(double **)shmalloc(sizeof(double *)*num_lsms); 
init_steps=(int *)shmalloc(sizeof(int)*num_lsms); for(int i=0; i<num_lsms; i++) { evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms); init_steps[i]=initial_steps; } total_init_steps=num_lsms*initial_steps; // Initialize the correct evec generator switch(evec_generation_mode) { case Random : generator = new RandomEvecGenerator(size_lsms); break; case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms); break; //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms, // evecs, wl_inf, wl_outf, wl_stepf); case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms, evecs, wl_inf, wl_outf, wl_stepf); break; case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms, evecs, wl_inf, wl_outf); break; //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms, // evecs, wl_inf, wl_outf, wl_stepf); case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms, evecs, wl_inf, wl_outf, wl_stepf); break; default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n"; exit(1); } for(int i=0; i<num_lsms; i++) { generator->initializeEvec(i,evecs[i]); } std::cout<<"This is the master node\n"; // issue initial commands to all LSMS instances running=0; bool more_work=true; if(total_init_steps>0) { for(int i=0; i<num_lsms; i++) { std::cout<<"starting initial calculation in group "<<i<<std::endl; //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD); shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]); shmem_int_p(&op, 5, lsms_rank0[i]); shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; } shmem_barrier(0, lsms_rank0[0], 2, pSync1); // first deal with the initial steps: while(running>0) { //if(return_moments_flag) // 
MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); //else // MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); shmem_int_wait(&flag,-1); running--; // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl; // std::cout<<" band energy E_band="<<r_values[1]<<std::endl; if(total_init_steps>0) { //int r_group=(status.MPI_SOURCE-align)/comm_size; int r_group=(flag-align)/comm_size; std::cout<<"starting additional calculation in group "<<r_group<<std::endl; if(init_steps[r_group]>0) { more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex])); init_steps[r_group]--; total_init_steps--; } //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD); shmem_double_put(r_values, evecs[r_group], 3*size_lsms, lsms_rank0[r_group]); //TODO check this shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps && num_steps<=0) more_work=false; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; walltime = get_rtc() - walltime_0; if(restrict_time && walltime>=max_time) more_work=false; if(restrict_time) std::cout<<" "<<max_time-walltime<<" seconds remaining\n"; } } } more_work=true; running=0; for(int i=0; i<num_lsms; i++) { std::cout<<"starting main calculation in group "<<i<<std::endl; //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD); shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]); shmem_int_p(&op, 5, lsms_rank0[i]); shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; } shmem_barrier(0, lsms_rank0[0], 2, pSync1); generator->startSampling(); // wait for results and issue new commands or wind down while(running>0) { //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status); shmem_int_wait(&flag,-1); running--; 
std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl; std::cout<<" band energy E_band="<<r_values[1]<<std::endl; // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE); energy_accumulator+=r_values[0]; energies_accumulated++; if(more_work) { int r_group=(status.MPI_SOURCE-align)/comm_size; std::cout<<"starting additional calculation in group "<<r_group<<std::endl; if(generator_needs_moment) { double m0,m1,m2; m0=0.0; m1=0.0; m2=0.0; for(int i=0; i<3*size_lsms; i+=3) { m0+=r_values[R_VALUE_OFFSET+i]; m1+=r_values[R_VALUE_OFFSET+i+1]; m2+=r_values[R_VALUE_OFFSET+i+2]; } switch(second_dimension) { case MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break; case MagneticMomentX : magnetization=m0; break; case MagneticMomentY : magnetization=m1; break; case MagneticMomentZ : magnetization=m2; break; } if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted)) more_work=false; } else { if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false; } //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD); shmem_double_put(r_values, evecs[r_group], 3*size_lsms, lsms_rank0[r_group]); //TODO check this shmem_fence(); num_steps--; running++; stepCount++; if(restrict_steps && num_steps<=0) more_work=false; if(restrict_steps) std::cout<<" "<<num_steps<<" steps remaining\n"; walltime = get_rtc() - walltime_0; if(restrict_time && walltime>=max_time) more_work=false; if(restrict_time) std::cout<<" "<<max_time-walltime<<" seconds remaining\n"; } else { // send an exit message to this instance of LSMS int r_group=(status.MPI_SOURCE-align)/comm_size; MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD); } if(step_out_flag && accepted) { step_out_file<<"# iteration "<<energies_accumulated<<std::endl; step_out_file.precision(15); step_out_file<<energies_accumulated<<std::endl; step_out_file<<r_values[0]<<" 
"<<r_values[1]<<std::endl; for(int j=0; j<3*size_lsms; j+=3) { step_out_file<<r_values[j+R_VALUE_OFFSET]<<" "<<r_values[j+R_VALUE_OFFSET+1] <<" "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl; } } // write restart file every restartWriteFrequency seconds if(walltime>nextWriteTime) { generator->writeState("WLrestart.jsn"); nextWriteTime+=restartWriteFrequency; } } generator->writeState("WLrestart.jsn"); /* if(evec_generation_mode==WangLandau_1d) (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state"); if(evec_generation_mode==ExhaustiveIsing) (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state"); */ for(int i=0; i<num_lsms; i++) free(evecs[i]); shfree(evecs); //shfree(r_values); } } if(world_rank==0) { if(step_out_flag) { step_out_file<<"# end\n-1\n" <<energy_accumulator/double(energies_accumulated)<<std::endl; step_out_file.close(); } std::cout<<"Finished all scheduled calculations. Freeing resources.\n"; std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n"; } if(num_lsms>1) { // make sure averyone arrives here: MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD); if(world_rank==0) { MPI_Comm_free(&local_comm); } else if(my_group>=0) { MPI_Comm_free(&local_comm); } } if(world_rank==0) { double walltime = get_rtc() - walltime_0; std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n"; std::cout<<" Monte-Carlo steps / walltime = " <<double(stepCount)/walltime<<"/sec\n"; } #ifdef USE_PAPI PAPI_stop_counters(papi_values,hw_counters); papi_values[hw_counters ] = PAPI_get_real_cyc()-papi_real_cyc_0; papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0; papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0; papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0; long long accumulated_counters[NUM_PAPI_EVENTS+4]; /* for(int i=0; i<hw_counters; i++) { printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]), 
std::cout,parameters.myrankWorld); } printline("PAPI real cycles : "+ttos(papi_values[hw_counters]), std::cout,parameters.myrankWorld); printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]), std::cout,parameters.myrankWorld); printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]), std::cout,parameters.myrankWorld); printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]), std::cout,parameters.myrankWorld); */ //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4, // MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD); shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4, comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2); if(world_rank==0) { for(int i=0; i<hw_counters; i++) { std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n"; } std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n"; std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n"; double gflops_papi = ((double)accumulated_counters[1])/ (1000.0*(double)papi_values[hw_counters+1]); double gflops_hw_double = ((double)accumulated_counters[2])/ (1000.0*(double)papi_values[hw_counters+1]); double gflops_hw_single = ((double)accumulated_counters[3])/ (1000.0*(double)papi_values[hw_counters+1]); double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]); std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n"; std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n"; std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n"; std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n"; } #endif //MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { int i = 0, rank, size; int skip, numprocs; static double avg_time = 0.0, max_time = 0.0, min_time = 0.0; static double latency = 0.0; int64_t t_start = 0, t_stop = 0, timer=0; char *buffer=NULL; int max_msg_size = 1048576, full = 0; int t; for ( t = 0; t < _SHMEM_BCAST_SYNC_SIZE; t += 1) pSyncBcast1[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_BCAST_SYNC_SIZE; t += 1) pSyncBcast2[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_REDUCE_SYNC_SIZE; t += 1) pSyncRed1[t] = _SHMEM_SYNC_VALUE; for ( t = 0; t < _SHMEM_REDUCE_SYNC_SIZE; t += 1) pSyncRed2[t] = _SHMEM_SYNC_VALUE; start_pes(0); rank = _my_pe(); numprocs = _num_pes(); if (process_args(argc, argv, rank, &max_msg_size, &full)) { return 0; } if(numprocs < 2) { if(rank == 0) { fprintf(stderr, "This test requires at least two processes\n"); } return -1; } print_header(rank, full); buffer = shmalloc(max_msg_size * sizeof(char)); if(NULL == buffer) { fprintf(stderr, "malloc failed.\n"); exit(1); } memset(buffer,1, max_msg_size); for(size=1; size <=max_msg_size/sizeof(uint32_t); size *= 2) { if(size > LARGE_MESSAGE_SIZE) { skip = SKIP_LARGE; iterations = iterations_large; } else { skip = SKIP; } timer=0; for(i=0; i < iterations + skip ; i++) { t_start = TIME(); if(i%2) shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast1); else shmem_broadcast32(buffer, buffer, size, 0, 0, 0, numprocs, pSyncBcast2); t_stop = TIME(); if(i>=skip){ timer+=t_stop-t_start; } shmem_barrier_all(); } shmem_barrier_all(); latency = (1.0 * timer) / iterations; shmem_double_min_to_all(&min_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1); shmem_double_max_to_all(&max_time, &latency, 1, 0, 0, numprocs, pWrk2, pSyncRed2); shmem_double_sum_to_all(&avg_time, &latency, 1, 0, 0, numprocs, pWrk1, pSyncRed1); avg_time = avg_time/numprocs; print_data(rank, full, size*sizeof(uint32_t), avg_time, min_time, max_time, iterations); } shfree(buffer); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { int i; shmem_init(); rank = shmem_my_pe(); world_size = shmem_n_pes(); /* root handles arguments and bcasts answers */ if (0 == rank) { int ch; while (start_err != 1 && (ch = getopt(argc, argv, "p:i:m:s:c:n:oh")) != -1) { switch (ch) { case 'p': npeers = atoi(optarg); break; case 'i': niters = atoi(optarg); break; case 'm': nmsgs = atoi(optarg); break; case 's': nbytes = atoi(optarg); break; case 'c': cache_size = atoi(optarg) / sizeof(int); break; case 'n': ppn = atoi(optarg); break; case 'o': machine_output = 1; break; case 'h': case '?': default: start_err = 1; usage(); } } /* sanity check */ if (start_err != 1) { #if 0 if (world_size < 3) { fprintf(stderr, "Error: At least three processes are required\n"); start_err = 1; } else #endif if (world_size <= npeers) { fprintf(stderr, "Error: job size (%d) <= number of peers (%d)\n", world_size, npeers); start_err = 77; } else if (ppn < 1) { fprintf(stderr, "Error: must specify process per node (-n #)\n"); start_err = 77; } else if (world_size / ppn <= npeers) { fprintf(stderr, "Error: node count <= number of peers\n"); start_err = 77; } } } for (i = 0; i < SHMEM_BCAST_SYNC_SIZE; i++) bcast_pSync[i] = SHMEM_SYNC_VALUE; for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) barrier_pSync[i] = SHMEM_SYNC_VALUE; for (i = 0; i < SHMEM_REDUCE_SYNC_SIZE; i++) reduce_pSync[i] = SHMEM_SYNC_VALUE; for (i = 0; i < SHMEM_REDUCE_MIN_WRKDATA_SIZE; i++) reduce_pWrk[i] = SHMEM_SYNC_VALUE; shmem_barrier_all(); /* broadcast results */ printf("%d: psync: 0x%lu\n", rank, (unsigned long) bcast_pSync); shmem_broadcast32(&start_err, &start_err, 1, 0, 0, 0, world_size, bcast_pSync); if (0 != start_err) { exit(start_err); } shmem_barrier_all(); shmem_broadcast32(&npeers, &npeers, 1, 0, 0, 0, world_size, bcast_pSync); shmem_barrier_all(); shmem_broadcast32(&niters, &niters, 1, 0, 0, 0, world_size, bcast_pSync); shmem_barrier_all(); shmem_broadcast32(&nmsgs, &nmsgs, 1, 0, 0, 0, world_size, bcast_pSync); 
shmem_barrier_all(); shmem_broadcast32(&nbytes, &nbytes, 1, 0, 0, 0, world_size, bcast_pSync); shmem_barrier_all(); shmem_broadcast32(&cache_size, &cache_size, 1, 0, 0, 0, world_size, bcast_pSync); shmem_barrier_all(); shmem_broadcast32(&ppn, &ppn, 1, 0, 0, 0, world_size, bcast_pSync); shmem_barrier_all(); if (0 == rank) { if (!machine_output) { printf("job size: %d\n", world_size); printf("npeers: %d\n", npeers); printf("niters: %d\n", niters); printf("nmsgs: %d\n", nmsgs); printf("nbytes: %d\n", nbytes); printf("cache size: %d\n", cache_size * (int)sizeof(int)); printf("ppn: %d\n", ppn); } else { printf("%d %d %d %d %d %d %d ", world_size, npeers, niters, nmsgs, nbytes, cache_size * (int)sizeof(int), ppn); } } /* allocate buffers */ send_peers = malloc(sizeof(int) * npeers); if (NULL == send_peers) abort_app("malloc"); recv_peers = malloc(sizeof(int) * npeers); if (NULL == recv_peers) abort_app("malloc"); cache_buf = malloc(sizeof(int) * cache_size); if (NULL == cache_buf) abort_app("malloc"); send_buf = malloc(npeers * nmsgs * nbytes); if (NULL == send_buf) abort_app("malloc"); memset(send_buf, 1, npeers * nmsgs * nbytes); recv_buf = shmem_malloc(npeers * nmsgs * nbytes); if (NULL == recv_buf) abort_app("malloc"); memset(recv_buf, 0, npeers * nmsgs * nbytes); /* calculate peers */ for (i = 0 ; i < npeers ; ++i) { if (i < npeers / 2) { send_peers[i] = (rank + world_size + ((i - npeers / 2) * ppn)) % world_size; } else { send_peers[i] = (rank + world_size + ((i - npeers / 2 + 1) * ppn)) % world_size; } } if (npeers % 2 == 0) { /* even */ for (i = 0 ; i < npeers ; ++i) { if (i < npeers / 2) { recv_peers[i] = (rank + world_size + ((i - npeers / 2) *ppn)) % world_size; } else { recv_peers[i] = (rank + world_size + ((i - npeers / 2 + 1) * ppn)) % world_size; } } } else { /* odd */ for (i = 0 ; i < npeers ; ++i) { if (i < npeers / 2 + 1) { recv_peers[i] = (rank + world_size + ((i - npeers / 2 - 1) * ppn)) % world_size; } else { recv_peers[i] = (rank + world_size + ((i 
- npeers / 2) * ppn)) % world_size; } } } /* BWB: FIX ME: trash the free lists / malloc here */ /* sync, although tests will do this on their own (in theory) */ shmem_barrier_all(); /* run tests */ test_one_way(); test_same_direction(); test_prepost(); test_allstart(); if (rank == 0 && machine_output) printf("\n"); /* done */ shmem_finalize(); return 0; }
int main(int argc, char **argv) { int i,ps,ps_cnt=2; int *target; int *source; int me, npes, elements=N_ELEMENTS, loops=DFLT_LOOPS; char *pgm; double start_time, time_taken; start_pes(0); me = _my_pe(); npes = _num_pes(); if ((pgm=strrchr(argv[0],'/'))) pgm++; else pgm = argv[0]; while ((i = getopt (argc, argv, "hve:l:p:s")) != EOF) { switch (i) { case 'v': Verbose++; break; case 'e': if ((elements = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad elements count %d\n",elements); return 1; } break; case 'l': if ((loops = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad loop count %d\n",loops); return 1; } break; case 'p': if ((ps_cnt = atoi_scaled(optarg)) <= 0) { fprintf(stderr,"ERR: Bad pSync[] elements %d\n",loops); return 1; } break; case 's': Serialize++; break; case 'h': if (me == 0) usage(pgm); return 0; default: if (me == 0) { fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i); usage(pgm); } return 1; } } ps_cnt *= _SHMEM_BCAST_SYNC_SIZE; pSync = shmalloc( ps_cnt * sizeof(long) ); for (i = 0; i < ps_cnt; i++) pSync[i] = _SHMEM_SYNC_VALUE; source = (int *) shmalloc( elements * sizeof(*source) ); target = (int *) shmalloc( elements * sizeof(*target) ); for (i = 0; i < elements; i += 1) { source[i] = i + 1; target[i] = -90; } if (me==0 && Verbose) fprintf(stderr,"ps_cnt %d loops %d nElems %d\n", ps_cnt,loops,elements); shmem_barrier_all(); for(time_taken = 0.0, ps = i = 0; i < loops; i++) { start_time = shmem_wtime(); shmem_broadcast32(target, source, elements, 0, 0, 0, npes, &pSync[ps]); if (Serialize) shmem_barrier_all(); time_taken += (shmem_wtime() - start_time); if (ps_cnt > 1 ) { ps += _SHMEM_BCAST_SYNC_SIZE; if ( ps >= ps_cnt ) ps = 0; } } if(me == 0 && Verbose) { printf("%d loops of Broadcast32(%ld bytes) over %d PEs: %7.3f secs\n", loops, (elements*sizeof(*source)), npes, time_taken); elements = (elements * loops * sizeof(*source)) / (1024*1024); printf(" %7.5f secs per broadcast() @ %7.4f MB/sec\n", (time_taken/(double)loops), 
((double)elements / time_taken) ); } if (Verbose > 1) fprintf(stderr,"[%d] pre B1\n",me); shmem_barrier_all(); if (Verbose > 1) fprintf(stderr,"[%d] post B1\n",me); shfree(pSync); shfree(target); shfree(source); return 0; }
int main (int argc, char *argv[]) { int myid, numprocs, i; double h, sum, x; struct timeval startwtime, endwtime; start_pes (0); numprocs = _num_pes (); myid = _my_pe (); if (myid == 0) { if (argc > 1) n = atoi (argv[1]); /* # rectangles on command line */ else n = 10000; /* default # of rectangles */ gettimeofday (&startwtime, NULL); } /* initialize sync array */ for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1) pSync[i] = _SHMEM_SYNC_VALUE; shmem_barrier_all (); /* send "n" out to everyone */ shmem_broadcast32 (&n, &n, 1, 0, 0, 0, numprocs, pSync); /* do partial computation */ h = 1.0 / (double) n; sum = 0.0; /* A slightly better approach starts from large i and works back */ for (i = myid + 1; i <= n; i += numprocs) { x = h * ((double) i - 0.5); sum += f (x); } mypi = h * sum; /* wait for everyone to finish */ shmem_barrier_all (); /* add up partial pi computations into PI */ shmem_double_sum_to_all (&pi, &mypi, 1, 0, 0, numprocs, pWrk, pSync); /* "master" PE summarizes */ if (myid == 0) { double elapsed; gettimeofday (&endwtime, NULL); elapsed = (endwtime.tv_sec - startwtime.tv_sec) * 1000.0; /* sec to ms */ elapsed += (endwtime.tv_usec - startwtime.tv_usec) / 1000.0; /* us to ms */ printf ("pi is approximately %.16f, Error is %.16f\n", pi, fabs (pi - PI25DT)); printf ("run time = %f ms\n", elapsed); fflush (stdout); } return 0; }
int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ int Block_size; /* size of a single block */ int Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ int order; /* order of overall matrix */ int bufferCount; /* number of input buffers */ int targetBuffer; /* buffer with which to communicate */ int send_to, recv_from; /* ranks with which to communicate */ long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ long i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int error; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double **Work_in_p; /* workspace for the transpose function */ double *Work_out_p; /* workspace for the transpose function */ double epsilon = 1.e-8; /* error tolerance */ double avgtime; /* timing parameters */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk; /* work space for SHMEM collectives */ double *local_trans_time, *trans_time; /* timing parameters */ double *abserr, *abserr_tot; /* local and aggregate error */ int *send_flag, *recv_flag; /* synchronization flags */ int *arguments; /* command line arguments */ /********************************************************************* ** Initialize the SHMEM environment *********************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM matrix transpose: B = A^T\n"); } // initialize sync variables for error checks 
pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE); local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); arguments = (int *) prk_shmem_align(prk_get_alignment(),4*sizeof(int)); abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double)); abserr_tot = abserr + 1; if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time || !trans_time || !arguments || !abserr) { printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID); error = 1; goto ENDOFTESTS; } for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { if (argc != 4 && argc != 5){ printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); arguments[1]=order; if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } bufferCount = atoi(*++argv); arguments[2]=bufferCount; if (Num_procs > 1) { if ((bufferCount < 1) || (bufferCount >= Num_procs)) { 
printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount); error = 1; goto ENDOFTESTS; } } if (argc == 5) Tile_order = atoi(*++argv); arguments[3]=Tile_order; ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); printf("Number of buffers = %d\n", bufferCount); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); } shmem_barrier_all(); /* Broadcast input data to all ranks */ shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; order=arguments[1]; bufferCount=arguments[2]; Tile_order=arguments[3]; shmem_barrier_all(); prk_shmem_free(arguments); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. 
*********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_in_p = (double**)prk_malloc(bufferCount*sizeof(double)); Work_out_p = (double *) prk_malloc(Block_size*sizeof(double)); recv_flag = (int*) prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int)); if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){ printf(" Error allocating space for work or flags on node %d\n",my_ID); error = 1; } if (bufferCount < (Num_procs - 1)) { send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int)); if (send_flag == NULL) { printf("Error allocating space for flags on node %d\n", my_ID); error = 1; } } bail_out(error); for(i=0;i<bufferCount;i++) { Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double)); if (Work_in_p[i] == NULL) { printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); } if (bufferCount < (Num_procs - 1)) { for(i=0;i<(Num_procs-1);i++) send_flag[i]=0; } for(i=0;i<bufferCount;i++) recv_flag[i]=0; } /* Fill the original column matrices */ istart = 0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = 
0.0; } shmem_barrier_all(); if (bufferCount < (Num_procs - 1)) { if (Num_procs > 1) { for ( i = 0; i < bufferCount; i++) { recv_from = (my_ID + i + 1)%Num_procs; shmem_int_inc(&send_flag[i], recv_from); } } } shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_trans_time[0] = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } for (phase=1; phase<Num_procs; phase++){ recv_from = (my_ID + phase )%Num_procs; send_to = (my_ID - phase + Num_procs)%Num_procs; targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount; istart = send_to*Block_order; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); A(it,jt) += 1.0; } } if (bufferCount < (Num_procs - 1)) shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1); shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to); shmem_fence(); shmem_int_inc(&recv_flag[targetBuffer], send_to); i = (iter * (Num_procs - 1) + phase) / bufferCount; if ((iter * (Num_procs - 1) + phase) % bufferCount) i++; shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=0; j<Block_order; j++) for (i=0; i<Block_order; i++) B(i,j) += Work_in(targetBuffer, i,j); if 
(bufferCount < (Num_procs - 1)) { if ((phase + bufferCount) < Num_procs) recv_from = (my_ID + phase + bufferCount) % Num_procs; else recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs; shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from); } } /* end of phase loop */ } /* end of iterations */ local_trans_time[0] = wtime() - local_trans_time[0]; shmem_barrier_all(); shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce); abserr[0] = 0.0; istart = 0; double addit = ((double)(iterations+1) * (double) (iterations))/2.0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit)); } shmem_barrier_all(); shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce); if (my_ID == root) { if (abserr_tot[0] <= epsilon) { printf("Solution validates\n"); avgtime = trans_time[0]/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr[0]); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon); error = 1; } } bail_out(error); if (Num_procs>1) { if (bufferCount < (Num_procs - 1)) prk_shmem_free(send_flag); prk_shmem_free(recv_flag); prk_free(Work_out_p); for(i=0;i<bufferCount;i++) prk_shmem_free(Work_in_p[i]); prk_free(Work_in_p); } prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk); prk_shmem_finalize(); exit(EXIT_SUCCESS); } /* end of main */
void communicateParameters(LSMSCommunication &comm, LSMSSystemParameters &lsms, CrystalParameters &crystal, MixingParameters &mix) { int const s=sizeof(LSMSSystemParameters)+9*sizeof(Real)+sizeof(int)+10 +sizeof(MixingParameters)+5*sizeof(int); int rem=0,ele=0; int tot_bufsize=s; rem=s%32; ele=s/32; if (rem!=0) { tot_bufsize=s-rem+32; ele++; } // TODO fine-tune this size tot_bufsize=65536; char* buf=(char*)shmalloc(tot_bufsize); int pos=0; int sec_id; if(comm.comm.rank==0) { //MPI_Pack(lsms.systemid,80,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.title,80,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.potential_file_in,128,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(lsms.potential_file_out,128,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.pot_in_type,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.pot_out_type,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.num_atoms,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nspin,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrel_rel,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrelc,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nrelv,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.n_spin_cant,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.n_spin_pola,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.mtasa,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.fixRMT,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.nscf,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.writeSteps,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.clight,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.grid,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.npts,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.ebot,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.etop,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.eibot,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.energyContour.eitop,1,MPI_DOUBLE,buf,s,&pos,comm.comm); 
//MPI_Pack(&lsms.energyContour.maxGroupSize,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.mixing,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.alphaDV,1,MPI_DOUBLE,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.iprint,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.print_node,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.default_iprint,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.istop,32,MPI_CHAR,buf,s,&pos,comm.comm); //MPI_Pack(&lsms.global.GPUThreads,32,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&crystal.num_types,1,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&crystal.bravais(0,0),9,MPI_DOUBLE,buf,s,&pos,comm.comm); //************ MemCpying *************** memcpy(&buf[pos],&lsms.systemid,80*char_size); pos = pos+80*char_size; memcpy(&buf[pos],&lsms.title,80*char_size); pos = pos+80*char_size; memcpy(&buf[pos],&lsms.potential_file_in,128*char_size); pos = pos+128*char_size; memcpy(&buf[pos],&lsms.potential_file_out,128*char_size); pos = pos+128*char_size; memcpy(&buf[pos],&lsms.pot_in_type,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.pot_out_type ,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.num_atoms,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nspin,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrel_rel,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrelc,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nrelv,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.n_spin_cant,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.n_spin_pola,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.mtasa,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.fixRMT,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.nscf,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.writeSteps,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.clight,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.grid,int_size); pos = pos+int_size; 
memcpy(&buf[pos],&lsms.energyContour.npts,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.energyContour.ebot,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.etop,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.eibot,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.eitop,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.energyContour.maxGroupSize,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.mixing,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.alphaDV,double_size); pos = pos+double_size; memcpy(&buf[pos],&lsms.global.iprint,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.print_node,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.default_iprint,int_size); pos = pos+int_size; memcpy(&buf[pos],&lsms.global.istop,32*char_size); pos = pos+32*char_size; memcpy(&buf[pos],&lsms.global.GPUThreads,32*int_size); pos = pos+32*int_size; memcpy(&buf[pos],&crystal.num_types,int_size); pos = pos+int_size; memcpy(&buf[pos],&crystal.bravais(0,0),9*double_size); pos = pos+9*double_size; // MixingParameters // MPI_CXX_BOOL is not always available // MPI_Pack(&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,buf,s,&pos,comm.comm); // copy to temporary int array and send this int tmpQuantity[mix.numQuantities]; for(int i=0; i<mix.numQuantities; i++) if(mix.quantity[i]) tmpQuantity[i] = 1; else tmpQuantity[i] = 0; //MPI_Pack(&tmpQuantity[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&mix.algorithm[0],mix.numQuantities,MPI_INT,buf,s,&pos,comm.comm); //MPI_Pack(&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,buf,s,&pos,comm.comm); memcpy(&buf[pos],&tmpQuantity[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&buf[pos],&mix.algorithm[0],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&buf[pos],&mix.mixingParameter[0],mix.numQuantities*double_size); pos = 
pos+mix.numQuantities*double_size; } //MPI_Bcast(buf,s,MPI_PACKED,0,comm.comm); shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(&buf[0], &buf[0], tot_bufsize, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank!=0) { int pos=0; //MPI_Unpack(buf,s,&pos,lsms.systemid,80,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.title,80,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.potential_file_in,128,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,lsms.potential_file_out,128,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.pot_in_type,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.pot_out_type,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.num_atoms,1,MPI_INT,comm.comm); memcpy(&lsms.systemid,&buf[pos],80*char_size); pos = pos+80*char_size; memcpy(&lsms.title,&buf[pos],80*char_size); pos = pos+80*char_size; memcpy(&lsms.potential_file_in,&buf[pos],128*char_size); pos = pos+128*char_size; memcpy(&lsms.potential_file_out,&buf[pos],128*char_size); pos = pos+128*char_size; memcpy(&lsms.pot_in_type,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.pot_out_type,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.num_atoms,&buf[pos],int_size); pos = pos+int_size; crystal.num_atoms=lsms.num_atoms; //MPI_Unpack(buf,s,&pos,&lsms.nspin,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrel_rel,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrelc,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nrelv,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.n_spin_cant,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.n_spin_pola,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.mtasa,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.fixRMT,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.nscf,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.writeSteps,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.clight,1,MPI_DOUBLE,comm.comm); 
//MPI_Unpack(buf,s,&pos,&lsms.energyContour.grid,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.npts,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.ebot,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.etop,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eibot,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.eitop,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.energyContour.maxGroupSize,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.mixing,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.alphaDV,1,MPI_DOUBLE,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.iprint,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.print_node,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.default_iprint,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.istop,32,MPI_CHAR,comm.comm); //MPI_Unpack(buf,s,&pos,&lsms.global.GPUThreads,32,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&crystal.num_types,1,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&crystal.bravais(0,0),9,MPI_DOUBLE,comm.comm); memcpy(&lsms.nspin,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrel_rel,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrelc,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nrelv,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.n_spin_cant,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.n_spin_pola,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.mtasa,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.fixRMT,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.nscf,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.writeSteps,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.clight,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.grid,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.energyContour.npts,&buf[pos],int_size); pos = pos+int_size; 
memcpy(&lsms.energyContour.ebot,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.etop,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.eibot,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.eitop,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.energyContour.maxGroupSize,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.mixing,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.alphaDV,&buf[pos],double_size); pos = pos+double_size; memcpy(&lsms.global.iprint,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.print_node,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.default_iprint,&buf[pos],int_size); pos = pos+int_size; memcpy(&lsms.global.istop,&buf[pos],32*char_size); pos = pos+32*char_size; memcpy(&lsms.global.GPUThreads,&buf[pos],32*int_size); pos = pos+32*int_size; memcpy(&crystal.num_types,&buf[pos],int_size); pos = pos+int_size; memcpy(&crystal.bravais(0,0),&buf[pos],9*double_size); pos = pos+9*double_size; crystal.resize(crystal.num_atoms); crystal.resizeTypes(crystal.num_types); // MixingParameters // MPI_CXX_BOOL is not always available // MPI_Unpack(buf,s,&pos,&mix.quantity[0],mix.numQuantities,MPI_CXX_BOOL,comm.comm); // recieve temporary int array and copy int tmpQuantity[mix.numQuantities]; //MPI_Unpack(buf,s,&pos,&tmpQuantity[0],mix.numQuantities,MPI_INT,comm.comm); memcpy(&tmpQuantity[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; for(int i=0; i<mix.numQuantities; i++) if(tmpQuantity[i]==1) mix.quantity[i] = true; else mix.quantity[i] = false; //MPI_Unpack(buf,s,&pos,&mix.algorithm[0],mix.numQuantities,MPI_INT,comm.comm); //MPI_Unpack(buf,s,&pos,&mix.mixingParameter[0],mix.numQuantities,MPI_DOUBLE,comm.comm); memcpy(&mix.algorithm[0],&buf[pos],mix.numQuantities*int_size); pos = pos+mix.numQuantities*int_size; memcpy(&mix.mixingParameter[0],&buf[pos],mix.numQuantities*double_size); pos = 
pos+mix.numQuantities*double_size; } for(int i=0; i<mix.numQuantities; i++) printf("mix.quantity[%d]=%d\n", i,mix.quantity[i]); // Allocate buffer for transmitting Crystal params int buff_size; if((crystal.num_types*sizeof(AtomType)) > (3*crystal.num_atoms*double_size)) buff_size = crystal.num_types*sizeof(AtomType); else buff_size = 3*crystal.num_atoms*double_size; shfree(buf); // TODO finetune buff-size buff_size=1048576; //sizeof(LSMSSystemParameters)+9*sizeof(Real); rem=buff_size%64; ele=buff_size/64; if(rem != 0) { buff_size=buff_size-rem+64; ele++; } double* temp_buff=(double*) shmalloc(buff_size); int* temp_intbuff=(int*) shmalloc(buff_size); //MPI_Bcast(&crystal.position(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm); //TODO check if a barrier is neededa after broadcast ... data not updated otherwise if(comm.comm.rank == 0) memcpy(temp_buff,&crystal.position(0,0),3*crystal.num_atoms*double_size); shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast64(temp_buff, temp_buff,3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0) memcpy(&crystal.position(0,0),temp_buff,3*crystal.num_atoms*double_size); //MPI_Bcast(&crystal.evecs(0,0),3*crystal.num_atoms,MPI_DOUBLE,0,comm.comm); if(comm.comm.rank == 0){ memcpy(temp_buff,&crystal.evecs(0,0),3*crystal.num_atoms*double_size); } shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast64(temp_buff, temp_buff, 3*crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0){ memcpy(&crystal.evecs(0,0),temp_buff,3*crystal.num_atoms*double_size); } //MPI_Bcast(&crystal.type[0],crystal.num_atoms,MPI_INT,0,comm.comm); if(comm.comm.rank == 0){ memcpy(temp_intbuff,&crystal.type[0],crystal.num_atoms*int_size); } shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(temp_intbuff, temp_intbuff, crystal.num_atoms, 0, 0, 0, comm.comm.size,pSync1); shmem_barrier(0, 0, 
comm.comm.size,pSync2); if(comm.comm.rank != 0){ memcpy(&crystal.type[0],temp_intbuff,crystal.num_atoms*int_size); } // This is dangerous and assumes homogeneous nodes: //MPI_Bcast(&crystal.types[0],crystal.num_types*sizeof(AtomType),MPI_BYTE,0,comm.comm); if(comm.comm.rank == 0) memcpy(temp_buff,&crystal.types[0],crystal.num_types*sizeof(AtomType)); // having to use the smallest possible broadcast:"32"-type shmem_barrier(0, 0, comm.comm.size,pSync2); shmem_broadcast32(temp_buff,temp_buff,crystal.num_types*sizeof(AtomType)/4,0,0,0,comm.comm.size,pSync1); shmem_barrier(0, 0, comm.comm.size,pSync2); if(comm.comm.rank != 0) memcpy(&crystal.types[0],temp_buff,crystal.num_types*sizeof(AtomType)); shmem_barrier(0, 0, comm.comm.size,pSync1); shfree(temp_buff); shfree(temp_intbuff); // get maximum lmax crystal.maxlmax=0; for(int i=0; i<crystal.num_types; i++) if(crystal.types[i].lmax>crystal.maxlmax) crystal.maxlmax=crystal.types[i].lmax; lsms.maxlmax=crystal.maxlmax; }
int main(int argc, char *argv[]){ int i,n,next_pivot, pivot; long pSync[_SHMEM_BCAST_SYNC_SIZE]; for (i=0; i < SHMEM_BCAST_SYNC_SIZE; i++) { pSync[i] = _SHMEM_SYNC_VALUE; } start_pes(0); me = shmem_my_pe(); npes = shmem_n_pes(); shmem_barrier_all(); srand (me+time(NULL)); N = atoi(argv[1]); //int *nelems = (int*) shmalloc(sizeof(int)); //int *nelems_import= (int*) shmalloc(sizeof(int));; printf("%d: Size = %d with np=%d\n",me,N,npes); A = (int *)shmalloc((N/npes)*sizeof(int)); temp_arr = (int *)shmalloc((N/npes)*sizeof(int)); if(A==NULL){ printf("\nOut of memory"); return 1; } n= N/npes; i=0; while(i<N/npes){ A[i] = rand()%(10000-0); i++; } printf("\nprocess %d elements:",me); for(i=0;i<(N/npes);i++){ printf("%d, ", A[i]); } next_pivot = A[0]; //the step two of algo.....broadcast the new pivot shmem_broadcast32(&next_pivot,A,1,0,0,0,npes,pSync); shmem_barrier_all(); pivot = quicksort(A, 0, n-1); printf("Process %d the pivot:%d",me, pivot); shmem_barrier_all(); //just for the sake of clear display...can be removed in the end printf("\nThe sorted list is of process %d: ",me); for(i=0;i<n;i++){ printf("%d, ",A[i]); } printf("\n"); printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots int check,j; //to check the division of the sorted arrays according to the new pivot. 
shmem_barrier_all(); check = uplowPartition(next_pivot); shmem_barrier_all(); printf("(%d)",me); for(int j=0;j<N/npes;j++){ printf("%d, ",A[j]); } printf("new partition: %d",check); shmem_barrier_all(); if(me < npes/2) { i=0; // printf("Hello from %d", me); printf("\n"); for(j=check;j<N/npes;j++){ temp_arr[i] = A[j]; i++; } i=0; printf("(%d)",me); for(j=check;j<N/npes;j++){ printf("%d, ",temp_arr[i]) ; i++; } // printf("\n"); } shmem_barrier_all(); if(me >= npes/2) { // printf("Hello from %d", me); printf("\n"); for(j=0;j<check;j++){ temp_arr[j] = A[j]; } printf("(%d)",me); for(j=0;j<check;j++){ printf("%d, ",temp_arr[j]) ; } // printf("\n"); } shmem_barrier_all(); printf("\n"); if(me < npes/2){ printf("\n"); pe = me +npes/2; nelems[0] = N/npes - check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,&A[check],nelems[0],pe); } shmem_barrier_all();//check if the entire barrier is needed if(me >= npes/ 2){ pe = me-npes/2;//check if it is synced nelems[0]= check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,A,nelems[0],pe); } shmem_barrier_all();//again sync is required...check it with profiling //this snippet is to check if the processors have got the high and low lists respectively ------------------- printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value printf("(%d) new elements = ",me); for(i=0;i<nelems_import[0];i++){ printf("%d, ",temp_arr[i]); } printf("\n"); //------------------------------------here this checking snippet ends---- //----------------------------------merging of arrays begin------------------------- if(me < npes/2){ i=0; for(j=nelems_import[0];j<(nelems_import[0]+check);j++){ temp_arr[j] = A[i]; 
i++; } } if(me >= npes/2){ i=check; for(j=nelems_import[0];j<(nelems_import[0]+N/npes-check);j++){ temp_arr[j] = A[i]; i++; } } shmem_barrier_all(); //to test if the arrays are merged properly int size; if(me < npes/2){ size = (nelems_import[0]+check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } if(me >= npes/2){ size = (nelems_import[0]+N/npes-check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } //-----------------------check of merging finishes-------- //--------------------------------------------------merging finishes------------------------------ //-----------------------sort again----------------------------------------------- if(me < npes/2){ quicksort(temp_arr,0,(nelems_import[0]+check-1)); } if(me >= npes/2){ quicksort(temp_arr,0,(nelems_import[0]+N/npes-check-1)); } //sorting routine checked...once program is done we can remove this part------------- shmem_barrier_all();//test purpose only if(me < npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } if(me >= npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); } printf("\n"); } //------------------------------------------------------------- //--------------------------------------------------------------------------------- shfree(temp_arr); shfree(A); shmem_finalize(); }
int main (int argc, char **argv) { /* arrays used to contain each PE's rows - specify cols, no need to spec rows */ float **U_Curr; float **U_Next; /* helper variables */ /* available iterator */ int i, j, k, m, n; int per_proc, remainder, my_start_row, my_end_row, my_num_rows; int verbose = 0; int show_time = 0; double time; double t, tv[2]; /*OpenSHMEM initilization*/ start_pes (0); p = _num_pes (); my_rank = _my_pe (); if (p > 8) { fprintf(stderr, "Ignoring test when run with more than 8 pes\n"); return 77; } /* argument processing done by everyone */ int c, errflg; extern char *optarg; extern int optind, optopt; while ((c = getopt (argc, argv, "e:h:m:tw:v")) != -1) { switch (c) { case 'e': EPSILON = atof (optarg); break; case 'h': HEIGHT = atoi (optarg); break; case 'm': /* selects the numerical methods */ switch (atoi (optarg)) { case 1: /* jacobi */ meth = 1; break; case 2: /* gauss-seidel */ meth = 2; break; case 3: /* sor */ meth = 3; break; } break; case 't': show_time++; /* overridden by -v (verbose) */ break; case 'w': WIDTH = atoi (optarg); break; case 'v': verbose++; break; /* handle bad arguments */ case ':': /* -h or -w without operand */ if (ROOT == my_rank) fprintf (stderr, "Option -%c requires an operand\n", optopt); errflg++; break; case '?': if (ROOT == my_rank) fprintf (stderr, "Unrecognized option: -%c\n", optopt); errflg++; break; } } if (ROOT == my_rank && argc < 2) { printf ("Using defaults: -h 20 -w 20 -m 2\n"); } // if (0 < errflg) // exit(EXIT_FAILURE); /* wait for user to input runtime params */ for (i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i += 1) pSync[i] = _SHMEM_SYNC_VALUE; shmem_barrier_all (); /* broadcast method to use */ shmem_broadcast32 (&meth, &meth, 1, 0, 0, 0, p, pSync); switch (meth) { case 1: method = &jacobi; break; case 2: method = &gauss_seidel; break; case 3: method = &sor; break; } /* let each processor decide what rows(s) it owns */ my_start_row = get_start (my_rank); my_end_row = get_end (my_rank); my_num_rows = 
get_num_rows (my_rank); if (0 < verbose) printf ("proc %d contains (%d) rows %d to %d\n", my_rank, my_num_rows, my_start_row, my_end_row); fflush (stdout); /* allocate 2d array */ U_Curr = (float **) malloc (sizeof (float *) * my_num_rows); U_Curr[0] = (float *) malloc (sizeof (float) * my_num_rows * (int) floor (WIDTH / H)); for (i = 1; i < my_num_rows; i++) { U_Curr[i] = U_Curr[i - 1] + (int) floor (WIDTH / H); } /* allocate 2d array */ U_Next = (float **) malloc (sizeof (float *) * my_num_rows); U_Next[0] = (float *) malloc (sizeof (float) * my_num_rows * (int) floor (WIDTH / H)); for (i = 1; i < my_num_rows; i++) { U_Next[i] = U_Next[i - 1] + (int) floor (WIDTH / H); } /* initialize global grid */ init_domain (U_Curr, my_rank); init_domain (U_Next, my_rank); /* iterate for solution */ if (my_rank == ROOT) { tv[0] = gettime (); } k = 1; while (1) { method (U_Curr, U_Next); local_convergence_sqd = get_convergence_sqd (U_Curr, U_Next, my_rank); shmem_barrier_all (); shmem_float_sum_to_all (&convergence_sqd, &local_convergence_sqd, 1, 0, 0, p, pWrk, pSync); if (my_rank == ROOT) { convergence = sqrt (convergence_sqd); if (verbose == 1) { printf ("L2 = %f\n", convergence); } } /* broadcast method to use */ shmem_barrier_all (); shmem_broadcast32 (&convergence, &convergence, 1, 0, 0, 0, p, pSync); if (convergence <= EPSILON) { break; } /* copy U_Next to U_Curr */ for (j = my_start_row; j <= my_end_row; j++) { for (i = 0; i < (int) floor (WIDTH / H); i++) { U_Curr[j - my_start_row][i] = U_Next[j - my_start_row][i]; } } k++; //MPI_Barrier(MPI_COMM_WORLD); shmem_barrier_all (); } /* say something at the end */ if (my_rank == ROOT) { //time = MPI_Wtime() - time; tv[1] = gettime (); t = dt (&tv[1], &tv[0]); printf ("Estimated time to convergence in %d iterations using %d processors on a %dx%d grid is %f seconds\n", k, p, (int) floor (WIDTH / H), (int) floor (HEIGHT / H), t / 1000000.0); } //MPI_Finalize(); exit (EXIT_SUCCESS); return 0; }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int my_ID; /* SHMEM rank */ int my_IDx, my_IDy; /* coordinates of rank in rank grid */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in[2]; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in[2];/* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in[2]; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in[2]; /* " " */ int root = 0; int n, width, height;/* linear global and local grid dimension */ int i, j, ii, jj, kk, it, jt, iter, leftover; /* dummies */ int istart, iend; /* bounds of grid tile assigned to calling rank */ int jstart, jend; /* bounds of grid tile assigned to calling rank */ DTYPE reference_norm; DTYPE f_active_points; /* interior of grid with respect to stencil */ int stencil_size; /* number of points in the stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double avgtime, /* timing parameters */ *local_stencil_time, *stencil_time; DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ int *arguments; /* command line parameters */ int count_case=4; /* number of neighbors of a rank */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk_time; /* work space for collectives */ DTYPE *pWrk_norm; /* work space for 
collectives */ int *iterflag; /* synchronization flags */ int sw; /* double buffering switch */ DTYPE *local_norm, *norm; /* local and global error norms */ /******************************************************************************* ** Initialize the SHMEM environment ********************************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); pSync_bcast = (long *) prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk_time = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double)); pWrk_norm = (DTYPE *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE)); local_stencil_time = (double *) prk_shmem_malloc(sizeof(double)); stencil_time = (double *) prk_shmem_malloc(sizeof(double)); local_norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); norm = (DTYPE *) prk_shmem_malloc(sizeof(DTYPE)); iterflag = (int *) prk_shmem_malloc(2*sizeof(int)); if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag && local_stencil_time && stencil_time && local_norm && norm)) { printf("Could not allocate scalar variables on rank %d\n", my_ID); error = 1; } bail_out(error); for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; arguments=(int*)prk_shmem_malloc(2*sizeof(int)); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { #ifndef STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 3){ printf("Usage: %s <# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if 
(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atoi(*++argv); arguments[1]=n; long nsquare = (long)n * (long)n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; break; } } my_IDx = my_ID%Num_procsx; my_IDy = my_ID/Num_procsx; /* compute neighbors; don't worry about dropping off the edges of the grid */ right_nbr = my_ID+1; left_nbr = my_ID-1; top_nbr = my_ID+Num_procsx; bottom_nbr = my_ID-Num_procsx; iterflag[0] = iterflag[1] = 0; if(my_IDx==0) count_case--; if(my_IDx==Num_procsx-1) count_case--; if(my_IDy==0) count_case--; if(my_IDy==Num_procsy-1) count_case--; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM stencil execution on 2D grid\n"); printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %d\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Type of stencil = star\n"); #ifdef DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif #if SPLITFENCE printf("Split fence = ON\n"); #else printf("Split fence = OFF\n"); #endif printf("Number of iterations = %d\n", iterations); 
} shmem_barrier_all(); shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; n=arguments[1]; shmem_barrier_all(); prk_shmem_free(arguments); /* compute amount of space required for input and solution arrays */ width = n/Num_procsx; leftover = n%Num_procsx; if (my_IDx<leftover) { istart = (width+1) * my_IDx; iend = istart + width + 1; } else { istart = (width+1) * leftover + width * (my_IDx-leftover); iend = istart + width; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_procsy; leftover = n%Num_procsy; if (my_IDy<leftover) { jstart = (height+1) * my_IDy; jend = jstart + height + 1; } else { jstart = (height+1) * leftover + height * (my_IDy-leftover); jend = jstart + height; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius\n", my_ID); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS); total_length_in *= (height+2*RADIUS); total_length_in *= sizeof(DTYPE); total_length_out = width; total_length_out *= height; total_length_out *= sizeof(DTYPE); in = (DTYPE *) malloc(total_length_in); out = (DTYPE *) malloc(total_length_out); if (!in || !out) { printf("ERROR: rank %d could not allocate space for input/output array\n", my_ID); error = 1; } bail_out(error); /* fill the stencil weights to reflect a discrete divergence operator */ for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm[0] = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and 
output arrays */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* allocate communication buffers for halo values */ top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width); if (!top_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); bottom_buf_out = top_buf_out+RADIUS*width; top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width); if(!top_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID); error=1; } bail_out(error); top_buf_in[1] = top_buf_in[0] + RADIUS*width; bottom_buf_in[0] = top_buf_in[1] + RADIUS*width; bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width; right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height); if (!right_buf_out) { printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); left_buf_out=right_buf_out+RADIUS*height; right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height); if(!right_buf_in) { printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID); error=1; } bail_out(error); right_buf_in[1] = right_buf_in[0] + RADIUS*height; left_buf_in[0] = right_buf_in[1] + RADIUS*height; left_buf_in[1] = left_buf_in[0] + RADIUS*height; /* make sure all symmetric heaps are allocated before being used */ shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_stencil_time[0] = wtime(); } /* sw determines which incoming buffer to select */ sw = iter%2; /* need to fetch ghost point data from neighbors */ if (my_IDy < Num_procsy-1) { for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) { top_buf_out[kk++]= IN(i,j); } shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr); #if SPLITFENCE shmem_fence(); 
shmem_int_inc(&iterflag[sw], top_nbr); #endif } if (my_IDy > 0) { for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) { bottom_buf_out[kk++]= IN(i,j); } shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], bottom_nbr); #endif } if(my_IDx < Num_procsx-1) { for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) { right_buf_out[kk++]=IN(i,j); } shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], right_nbr); #endif } if(my_IDx>0) { for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) { left_buf_out[kk++]=IN(i,j); } shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr); #if SPLITFENCE shmem_fence(); shmem_int_inc(&iterflag[sw], left_nbr); #endif } #if SPLITFENCE == 0 shmem_fence(); if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr); if(my_IDy>0) shmem_int_inc(&iterflag[sw], bottom_nbr); if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr); if(my_IDx>0) shmem_int_inc(&iterflag[sw], left_nbr); #endif shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1)); if (my_IDy < Num_procsy-1) { for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = top_buf_in[sw][kk++]; } } if (my_IDy > 0) { for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) { IN(i,j) = bottom_buf_in[sw][kk++]; } } if (my_IDx < Num_procsx-1) { for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) { IN(i,j) = right_buf_in[sw][kk++]; } } if (my_IDx > 0) { for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) { IN(i,j) = left_buf_in[sw][kk++]; } } /* Apply the stencil operator */ for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) { for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) { #if LOOPGEN #include "loop_body_star.incl" 
#else for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* add constant to solution to force refresh of neighbor data, if any */ for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0; } local_stencil_time[0] = wtime() - local_stencil_time[0]; shmem_barrier_all(); shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0, Num_procs, pWrk_time, pSync_reduce); /* compute L1 norm in parallel */ local_norm[0] = (DTYPE) 0.0; for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) { for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) { local_norm[0] += (DTYPE)ABS(OUT(i,j)); } } shmem_barrier_all(); #ifdef DOUBLE shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #else shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce); #endif /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm[0] /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm[0]-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm[0], reference_norm); error = 1; } else { printf("Solution validates\n"); #ifdef VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm[0]); #endif } } bail_out(error); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time[0]/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } prk_shmem_free(top_buf_in); prk_shmem_free(right_buf_in); free(top_buf_out); free(right_buf_out); prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk_time); prk_shmem_free(pWrk_norm); prk_shmem_finalize(); exit(EXIT_SUCCESS); }
int hyperquick(int *A, int N, int npes){ int pivot; int i; //the step two of algo.....broadcast the new pivot //pivot = quicksort(A, 0, n-1); next_pivot = A[N/2]; //the median //shmem_barrier_all(); //printf("(%d) N= %d\n",me,N); shmem_broadcast32(&next_pivot,&next_pivot,1,0,0,0,npes,pSync); shmem_barrier_all(); /*printf("Process %d the pivot:%d",me, pivot); shmem_barrier_all(); //just for the sake of clear display...can be removed in the end printf("\nThe sorted list is of process %d: ",me); for(i=0;i<N/npes;i++){ printf("%d, ",A[i]); } printf("\n");*/ printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots int check,j; //to check the division of the sorted arrays according to the new pivot. shmem_barrier_all(); check = uplowPartition(A, next_pivot, N*npes, npes); shmem_barrier_all(); printf("(%d)",me); for(int j=0;j<N;j++){ printf("%d, ",A[j]); } printf("new partition: %d",check); shmem_barrier_all(); printf("\n"); if(me < npes/2){ printf("\n"); pe = me +npes/2; nelems[0] = N - check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,&A[check],nelems[0],pe); } shmem_barrier_all();//check if the entire barrier is needed if(me >= npes/ 2){ pe = me-npes/2;//check if it is synced nelems[0]= check; printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value shmem_int_p(nelems_import,nelems[0],pe); shmem_quiet(); shmem_int_put(temp_arr,A,nelems[0],pe); } shmem_barrier_all();//again sync is required...check it with profiling //this snippet is to check if the processors have got the high and low lists respectively ------------------- printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value printf("(%d) new elements = ",me); for(i=0;i<nelems_import[0];i++){ printf("%d, 
",temp_arr[i]); } printf("\n"); //------------------------------------here this checking snippet ends---- //----------------------------------merging of arrays begin------------------------- if(me < npes/2){ i=0; for(j=nelems_import[0];j<(nelems_import[0]+check);j++){ temp_arr[j] = A[i]; i++; } } if(me >= npes/2){ i=check; for(j=nelems_import[0];j<(nelems_import[0]+N-check);j++){ temp_arr[j] = A[i]; i++; } } shmem_barrier_all(); //to test if the arrays are merged properly int size; if(me < npes/2){ size = (nelems_import[0]+check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } if(me >= npes/2){ size = (nelems_import[0]+N-check); printf("(%d) merged array:",me); for(j=0;j<size;j++){ printf("%d, ",temp_arr[j]); } printf("\n"); } //-----------------------check of merging finishes-------- //--------------------------------------------------merging finishes------------------------------ //-----------------------sort again----------------------------------------------- if(me < npes/2){ quicksort(temp_arr,0,(nelems_import[0]+check-1)); } if(me >= npes/2){ quicksort(temp_arr,0,(nelems_import[0]+N-check-1)); } //sorting routine checked...once program is done we can remove this part------------- shmem_barrier_all();//test purpose only if(me < npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); A[i] = temp_arr[i]; } printf("\n"); } if(me >= npes/2){ printf("(%d) sorted list: ",me); for(i=0;i<size;i++){ printf("%d, ",temp_arr[i]); A[i] = temp_arr[i]; } printf("\n"); } //------------------------------------------------------------- //--------------------------------------------------------------------------------- //hyperquick(A,size,npes/2); }