/* Validate that each requested event is available; warn (but do not abort)
   if more events are requested than there are hardware counters. */
void check_events(int *events, int nb_events)
{
    int i, retval;

    if (nb_events > PAPI_num_counters()) {
        fprintf(stderr, "Too many counters: %d\nMaximum available: %d\n",
                nb_events, PAPI_num_counters());
    }

    for (i = 0; i < nb_events; i++) {
        if ((retval = PAPI_query_event(events[i])) != PAPI_OK) {
            PAPI_event_info_t info;
            PAPI_get_event_info(events[i], &info);
            fprintf(stderr, "No hardware counter for \"%s\" event\n%s\n",
                    info.short_descr, PAPI_strerror(retval));
        }
    }
}
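/* A minimal caller sketch (an assumption for illustration, not part of the
   original source): the PAPI library must be initialized before
   check_events() can query events. */
#include <papi.h>
#include <stdio.h>

int main(void)
{
    int events[2] = { PAPI_TOT_INS, PAPI_TOT_CYC };

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI_library_init error.\n");
        return 1;
    }
    check_events(events, 2);  /* warns on unavailable events, does not abort */
    return 0;
}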
void initBigSimTrace(int outputParams, int _outputTiming)
{
  CkpvInitialize(int, outputParameters);
  CkpvAccess(outputParameters) = outputParams;
  bgTraceCounter = 0;
#ifdef CMK_BIGSIM_CHARM
  outputTiming = _outputTiming;
  if (!BgIsReplay()) outputTiming = 0;   // timing output only applies in replay mode
#endif
  CkpvInitialize(bool, insideTraceBracket);
  CkpvAccess(insideTraceBracket) = false;

  CkpvInitialize(double, start_time);
  CkpvInitialize(double, end_time);

  CkpvInitialize(FILE*, bgfp);
  CkpvAccess(bgfp) = NULL;
#ifdef CMK_BIGSIM_CHARM
  // for bigsim emulation, write to files, one for each processor;
  // always write immediately, instead of storing and dumping at the end
  if (!outputTiming) {
    char fname[128];
    const char *subdir = "params";
    CmiMkdir(subdir);
    sprintf(fname, "%s/param.%d", subdir, CkMyPe());
    CkpvAccess(bgfp) = fopen(fname, "w");
    if (CkpvAccess(bgfp) == NULL)
      CmiAbort("Failed to generate trace param file!");
  }
#endif
  // for Mambo simulation, write to screen for now
  // CkpvAccess(bgfp) = stdout;
  if (CkpvAccess(outputParameters)) {
    CkpvInitialize(StringPool, eventsPool);
    if (CkMyPe() == 0) CmiPrintf("outputParameters enabled!\n");
#ifdef CMK_BIGSIM_CHARM
    BgRegisterUserTracingFunction(finalizeBigSimTrace);
#endif
  }
#ifdef BIG_SIM_PAPI
  CkPrintf("PAPI: number of available counters: %d\n", PAPI_num_counters());
  CkAssert(PAPI_num_counters() >= 0);
#endif
}
void start_papi()
{
    values = calloc(en, sizeof(long long));

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI is unsupported.\n");
        papi_supported = 0;
    }

    if (PAPI_num_counters() < en) {
        fprintf(stderr, "Fewer than %d hardware counters available; PAPI is unsupported.\n", en);
        papi_supported = 0;
    }

    /* only set up and start the event set if PAPI is usable */
    if (papi_supported) {
        if ((papi_err = PAPI_create_eventset(&eventSet)) != PAPI_OK) {
            fprintf(stderr, "Could not create event set: %s\n",
                    PAPI_strerror(papi_err));
        }

        for (int i = 0; i < en; ++i) {
            if ((papi_err = PAPI_add_event(eventSet, events[i])) != PAPI_OK) {
                fprintf(stderr, "Could not add event %s: %s\n",
                        event_names[i], PAPI_strerror(papi_err));
            }
        }

        /* start counters */
        if ((papi_err = PAPI_start(eventSet)) != PAPI_OK) {
            fprintf(stderr, "Could not start counters: %s\n",
                    PAPI_strerror(papi_err));
        }
    }
}
void papi_init(int eventNumber)
{
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI is unsupported.\n");
        papi_supported = false;
    }

    if (PAPI_num_counters() < 5) {   /* require at least 5 hardware counters */
        fprintf(stderr, "PAPI is unsupported.\n");
        papi_supported = false;
    }

    if ((papi_err = PAPI_create_eventset(&eventSet)) != PAPI_OK) {
        fprintf(stderr, "Could not create event set: %s\n",
                PAPI_strerror(papi_err));
    }

    /* force the program to run on a single CPU so the counts are not split
       across cores (requires _GNU_SOURCE and <sched.h>) */
    cpu_set_t my_set;        /* CPU affinity bit mask */
    CPU_ZERO(&my_set);       /* initialize to empty: no CPUs selected */
    CPU_SET(0, &my_set);     /* select CPU 0 */
    if (sched_setaffinity(0, sizeof(cpu_set_t), &my_set) != 0) {
        perror("sched_setaffinity error");
    }

    if ((papi_err = PAPI_add_event(eventSet, events[eventNumber])) != PAPI_OK) {
        fprintf(stderr, "Could not add event: %s\n", PAPI_strerror(papi_err));
    }
}
//
// This method should be placed at the start of instrumented code
//
void startPapiCounters(){
  initializeCounters(0);

#ifdef DBG
  printGEvents();
  printf("********* STARTING COUNTERS *************\n");
  //assert(NUM_EVENTS == _G_EVENT_COUNT);
#endif

  // initialize papi library and assert that it's successful
  _CALL_PAPI(PAPI_library_init( PAPI_VER_CURRENT ));

  // check that all the events can be counted at once.
  int numCounters = PAPI_num_counters();
  assert( NUM_EVENTS <= numCounters );

#ifdef DBG
  printf("Number of hardware counters available on this machine: %d\n", numCounters);
#endif

  for ( int i = 0; i < NUM_EVENTS; i++ ) {
    char name[PAPI_MAX_STR_LEN];
    (void) _CALL_PAPI(PAPI_event_code_to_name( _G_EVENTS[i], name ));
    if (PAPI_query_event( _G_EVENTS[i] ) != PAPI_OK) {
      fprintf(stderr, "Event %s could not be counted on this machine.\n", name);
      abort();
    }
  }

  //******* Start Counters ******
  (void) _CALL_PAPI(PAPI_start_counters(_G_EVENTS, NUM_EVENTS));
}
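/* _CALL_PAPI is not defined in this snippet. A plausible definition — an
   assumption for illustration, not the original project's macro — routes the
   return code through a checker that aborts on any PAPI error. It is written
   as an expression so the "(void) _CALL_PAPI(...)" casts above compile. */
#include <papi.h>
#include <stdio.h>
#include <stdlib.h>

static int _CALL_PAPI_impl(int rv, const char *expr)
{
    /* PAPI_library_init() returns PAPI_VER_CURRENT on success, so accept
       both that and PAPI_OK */
    if (rv != PAPI_OK && rv != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI error in %s: %s\n", expr, PAPI_strerror(rv));
        abort();
    }
    return rv;
}
#define _CALL_PAPI(call) _CALL_PAPI_impl((call), #call)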
void perf_lib_init(const char *perfcfg, const char *perfout)
{
    int retval;

    retval = PAPI_library_init(PAPI_VER_CURRENT);
    assert(retval == PAPI_VER_CURRENT);
    retval = PAPI_thread_init(pthread_self);
    assert(retval == PAPI_OK);

    if (perfcfg) PERF_CONFIG = mystrdup(perfcfg);
    if (perfout) PERF_OUT = mystrdup(perfout);

    int max_counters = PAPI_num_counters();
    if (max_counters > MAX_PERF_EVENTS)
        max_counters = MAX_PERF_EVENTS;

    PERF_EVENT_NAMES = (char**)malloc(sizeof(char*) * max_counters);
    assert(PERF_EVENT_NAMES != NULL);
    memset(PERF_EVENT_NAMES, 0x0, sizeof(char*) * max_counters);

    if (PERF_CONFIG) {
        char line[80];
        FILE *config = fopen(PERF_CONFIG, "r");
        assert(config != NULL);
        /* one event name per line; lines starting with '#' are comments */
        while (fgets(line, 80, config) != NULL && NUM_EVENTS < max_counters) {
            if (line[0] == '#') continue;
            PERF_EVENT_NAMES[NUM_EVENTS] = mystrdup(line);
            NUM_EVENTS++;
        }
        if (!feof(config))
            fprintf(stderr, "Too many counters requested. Only taking the first %d.\n",
                    max_counters);
        fclose(config);
    } else {
        /* if no config file is specified, add default events only */
        NUM_EVENTS = sizeof(DEFAULT_EVENTS) / sizeof(char*);
        if (NUM_EVENTS > max_counters) {
            NUM_EVENTS = max_counters;
            fprintf(stderr, "Too many counters requested. Only taking the first %d.\n",
                    max_counters);
        }
        for (int i = 0; i < NUM_EVENTS; i++)
            PERF_EVENT_NAMES[i] = mystrdup(DEFAULT_EVENTS[i]);
    }

    for (int i = 0; i != NUM_EVENTS; ++i)
        PERF_COUNTER_INITIALIZER.value[i] = 0;
}
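/* A config file for perf_lib_init() is read line by line: '#' lines are
   skipped and every other line is taken verbatim as an event name. Note that
   fgets() keeps the trailing newline, and this code does not strip it before
   mystrdup(). A hypothetical example file (an assumption for illustration):

       # events to record
       PAPI_TOT_INS
       PAPI_TOT_CYC
       PAPI_L1_DCM
*/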
void papi_init()
{
    int max;

    /* Check PAPI sanity */
    if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT))
        papi_eprintf("PAPI_library_init error.\n");

    max = PAPI_num_counters();
    /* NOTE: PAPI_reset() expects an event set handle, not a counter count,
       so passing `max` here is a misuse of the API. */
    PAPI_reset(max);
}
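/* For reference — a sketch of the (older) high-level counter API used
   throughout these snippets, assuming the PAPI_TOT_CYC preset is available:
   counters are zeroed by PAPI_start_counters() and again by every
   PAPI_read_counters() call, so no separate reset step is needed. */
#include <papi.h>

void counters_sketch(void)
{
    int events[1] = { PAPI_TOT_CYC };
    long long vals[1];

    PAPI_start_counters(events, 1);   /* start and zero the counters */
    /* ... region of interest ... */
    PAPI_read_counters(vals, 1);      /* read, then re-zero */
    PAPI_stop_counters(vals, 1);      /* final read and stop */
}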
int main(int argc, char *argv[])
{
    double a[MAXVSIZE], b[MAXVSIZE], c[MAXVSIZE];
    int i, n;

    if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT))
        ehandler("PAPI_library_init error.");

    const size_t EVENT_MAX = PAPI_num_counters();
    printf("# Max counters = %zu\n", EVENT_MAX);

    if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS))
        ehandler("Cannot count PAPI_TOT_INS.");
    if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS))
        ehandler("Cannot count PAPI_FP_OPS.");

    size_t EVENT_COUNT = 2;
    int events[] = { PAPI_TOT_INS, PAPI_FP_OPS };
    long long values[EVENT_COUNT];

    printf("Enter vector size: ");
    scanf("%d", &n);
    for (i = 0; i < n; i++) {
        a[i] = i;
        b[i] = n - i;
    }

    PAPI_start_counters(events, EVENT_COUNT);
    /* read once to zero the counters before the measured region */
    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

    loop(c, a, b, n);

    if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
        ehandler("Problem reading counters.");

    printf("Number of instructions = %lld\n", values[0]);
    printf("Number of fp operations = %lld\n", values[1]);
    return 0;
}
int main ()
{
    float t0, t1;
    int iter, i, j;
    int events[2] = { PAPI_L1_DCM, PAPI_FP_OPS }, ret;
    long_long values[2];

    if (PAPI_num_counters() < 2) {
        fprintf(stderr, "No hardware counters here, or PAPI not supported.\n");
        exit(1);
    }

    for (i = 0; i < MX; i++) {
        if ((ad[i] = malloc(sizeof(double) * MX)) == NULL) {
            fprintf(stderr, "malloc failed\n");
            exit(1);
        }
    }

    for (j = 0; j < MX; j++) {
        for (i = 0; i < MX; i++) {
            ad[i][j] = 1.0 / 3.0;   /* Initialize the data */
        }
    }

    t0 = gettime();
    if ((ret = PAPI_start_counters(events, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(ret));
        exit(1);
    }

    for (iter = 0; iter < NITER; iter++) {
        for (j = 0; j < MX; j++) {
            for (i = 0; i < MX; i++) {
                ad[i][j] += ad[i][j] * 3.0;
            }
        }
    }

    if ((ret = PAPI_read_counters(values, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI failed to read counters: %s\n", PAPI_strerror(ret));
        exit(1);
    }
    t1 = gettime();

    printf("Total software flops = %f\n", (float)TOT_FLOPS);
    printf("Total hardware flops = %lld\n", values[1]);
    printf("MFlop/s = %f\n", (float)(TOT_FLOPS / MEGA) / (t1 - t0));
    printf("L1 data cache misses is %lld\n", values[0]);

    return 0;
}
void papi_set_events(char *metric)
{
    const size_t n = 1;
    int max;
    long_long *papi_tmp;
    int papi_events[1];
    int code;

    max = PAPI_num_counters();
    if (n > (size_t)max)
        papi_eprintf("Too many counters requested.\n");

    papi_tmp = malloc(sizeof(*papi_tmp) * n);

    PAPI_reset(max);
    /* stop any counters left running from a previous measurement */
    PAPI_stop_counters(papi_tmp, n);

    if (PAPI_event_name_to_code(metric, &code) != PAPI_OK)
        papi_eprintf("Unknown PAPI event %s.\n", metric);
    if (code == 0)
        papi_eprintf("Unknown PAPI event %s.\n", metric);

    papi_events[0] = code;

    PAPI_start_counters(papi_events, n);
    if (PAPI_read_counters(papi_tmp, n) != PAPI_OK)
        papi_eprintf("Problem reading counters %s:%d.\n", __FILE__, __LINE__);

    free(papi_tmp);
}
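/* Hypothetical usage (names outside the snippet are assumptions): select a
   single metric by name, run the region of interest, then read the counter
   back with PAPI_read_counters(). */
long_long count[1];
papi_set_events("PAPI_L1_DCM");   /* program the L1 data-cache-miss counter */
/* ... code under measurement ... */
PAPI_read_counters(count, 1);     /* count[0] now holds the L1 DCM total */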
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms;            // number of parallel LSMS instances
  int size_lsms;           // number of atoms in an LSMS instance
  int num_steps;           // number of energy calculations
  int initial_steps;       // number of steps before sampling starts
  int stepCount = 0;       // count the Monte Carlo steps executed
  double max_time;         // maximum walltime for this run in seconds
  bool restrict_time = false;   // was the maximum time specified?
  bool restrict_steps = false;  // or the max. number of steps?
  int align;               // alignment of lsms_instances

  double magnetization;
  double energy_accumulator;    // accumulates the energy to calculate the mean
  int energies_accumulated;

  int new_peid, new_root;
  static int op, flag;
  double *evec, *r_values;

  energy_accumulator = 0.0;
  energies_accumulated = 0;

  double walltime_0, walltime;
  double restartWriteFrequency = 30.0 * 60.0;
  double nextWriteTime = restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf = NULL;
  bool step_out_flag = false;
  std::ofstream step_out_file;

  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag = true;  // true -> return all magnetic moments from the lsms run at each step
  bool generator_needs_moment = false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex = 1;  // index of the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0] = ev0[1] = 0.0; ev0[2] = 1.0;
  // size has to be align + size_lsms*num_lsms
  align = 1;
  num_lsms = 1;
  size_lsms = -1;
  my_group = -1;
  num_steps = 1;
  initial_steps = 0;

  sprintf(i_lsms_name, "i_lsms");
  gWL_in_name[0] = gWL_out_name[0] = 0;
  mode_name[0] = 0;
  energy_calculation_name[0] = 0;

  // check command line arguments
  for (int i = 0; i < argc; i++)
  {
    if (!strcmp("-num_lsms", argv[i])) num_lsms = atoi(argv[++i]);
    if (!strcmp("-size_lsms", argv[i])) size_lsms = atoi(argv[++i]);
    if (!strcmp("-align", argv[i])) align = atoi(argv[++i]);
    if (!strcmp("-num_steps", argv[i])) { num_steps = atoi(argv[++i]); restrict_steps = true; }
    if (!strcmp("-initial_steps", argv[i])) initial_steps = atoi(argv[++i]);
    if (!strcmp("-walltime", argv[i])) { max_time = 60.0 * atof(argv[++i]); restrict_time = true; }
    if (!strcmp("-i", argv[i])) strncpy(i_lsms_name, argv[++i], 64);
    if (!strcmp("-random_dir", argv[i])) { evec_generation_mode = Random; }
    if (!strcmp("-step_out", argv[i]))
    { strncpy(step_out_name, argv[++i], 64); step_out_flag = true; return_moments_flag = true; }
    if (!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name, argv[++i], 64);
    if (!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name, argv[++i], 64);
    if (!strcmp("-mode", argv[i])) strncpy(mode_name, argv[++i], 64);
    if (!strcmp("-energy_calculation", argv[i])) strncpy(energy_calculation_name, argv[++i], 64);
  }

  // allocate the symmetric buffers used for SHMEM transfers (size_lsms must
  // already hold its final value here)
  evec = (double *)shmalloc(sizeof(double) * 3 * size_lsms);
  r_values = (double *)shmalloc(sizeof(double) * (R_VALUE_OFFSET + 3 * (size_lsms + 1)));

  if (!(restrict_steps || restrict_time)) restrict_steps = true;

  if (mode_name[0] != 0)
  {
    if (!strcmp("constant", mode_name)) evec_generation_mode = Constant;
    if (!strcmp("random", mode_name)) evec_generation_mode = Random;
    if (!strcmp("1d", mode_name)) evec_generation_mode = WangLandau_1d;
    if (!strcmp("ising", mode_name)) evec_generation_mode = ExhaustiveIsing;
    if (!strcmp("2d", mode_name)) evec_generation_mode = WangLandau_2d;
    if (!strcmp("2d-m", mode_name)) { evec_generation_mode = WangLandau_2d; second_dimension = MagneticMoment; }
    if (!strcmp("2d-x", mode_name)) { evec_generation_mode = WangLandau_2d; second_dimension = MagneticMomentX; }
    if (!strcmp("2d-y", mode_name)) { evec_generation_mode = WangLandau_2d; second_dimension = MagneticMomentY; }
    if (!strcmp("2d-z", mode_name)) { evec_generation_mode = WangLandau_2d; second_dimension = MagneticMomentZ; }
  }

  if (energy_calculation_name[0] != 0)
  {
    if (energy_calculation_name[0] == 'o') { energyCalculationMode = OneStepEnergy; energyIndex = 1; }
    if (energy_calculation_name[0] == 'm') { energyCalculationMode = MultiStepEnergy; energyIndex = 1; }
    if (energy_calculation_name[0] == 's') { energyCalculationMode = ScfEnergy; energyIndex = 0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if (hw_counters > NUM_PAPI_EVENTS) hw_counters = NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS];
  // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {
    "PAPI_TOT_INS", "PAPI_FP_OPS",
    "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
    "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"

  // get events from names:
  for (int i = 0; i < NUM_PAPI_EVENTS; i++)
  {
    if (PAPI_event_name_to_code(papi_event_name[i], &papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if (hw_counters > i) hw_counters = i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS + 4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if (hw_counters > NUM_PAPI_EVENTS) hw_counters = NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events, hw_counters);
#endif

  lsms_rank0 = (int *)malloc(sizeof(int) * (num_lsms + 1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank = rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

  // make sure 'return_moments_flag' is set correctly
  switch (evec_generation_mode)
  {
  case Constant: break;
  case Random: break;
  case WangLandau_1d:
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing: break;
  case WangLandau_2d:
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default:
    std::cout << " ERROR: UNKNOWN EVEC GENERATION MODE\n";
    exit(1);
  }

  if (rank == 0)
  {
    std::cout << "LSMS_3" << std::endl;
    std::cout << " SVN revision " << SVN_REV << std::endl << std::endl;
#ifdef USE_PAPI
    std::cout << " Using Papi counters" << std::endl << std::endl;
#endif
    std::cout << " Size of LSMS instances = " << size_lsms << " atoms\n";
    std::cout << " Number of LSMS instances = " << num_lsms << std::endl;
    std::cout << " LSMS Energy calculated using ";
    switch (energyCalculationMode)
    {
    case OneStepEnergy:
      std::cout << "oneStepEnergy [frozen potential band energy]" << std::endl;
      break;
    case MultiStepEnergy:
      std::cout << "multiStepEnergy [frozen potential band energy with converged Fermi energy]" << std::endl;
      break;
    case ScfEnergy:
      std::cout << "scfEnergy [self-consistent total energy]" << std::endl;
      break;
    default:
      std::cout << "UNKNOWN ENERGY CALCULATION METHOD" << std::endl;
      exit(1);
    }
    if (restrict_steps) std::cout << " Number of gWL steps = " << num_steps << std::endl;
    if (restrict_time) std::cout << " Maximum walltime = " << max_time << "s\n";
    std::cout << " Processor alignment (process allocation quantization) = " << align << std::endl;
    switch (evec_generation_mode)
    {
    case Constant:
      std::cout << " Constant moments direction along "
                << ev0[0] << " " << ev0[1] << " " << ev0[2] << std::endl;
      break;
    case Random:
      std::cout << " Random distribution of moments (no Wang-Landau)" << std::endl;
      break;
    case WangLandau_1d:
      std::cout << " Wang-Landau for one continuous variable (energy)" << std::endl;
      // return_moments_flag = true;
      // generator_needs_moment = true;
      break;
    case ExhaustiveIsing:
      std::cout << " Exhaustive Ising sampling" << std::endl;
      break;
    case WangLandau_2d:
      std::cout << " Wang-Landau for two continuous variables (energy, ";
      switch (second_dimension)
      {
      case MagneticMoment:  std::cout << "magnitude of magnetization)"; break;
      case MagneticMomentX: std::cout << "x component of magnetization)"; break;
      case MagneticMomentY: std::cout << "y component of magnetization)"; break;
      case MagneticMomentZ: std::cout << "z component of magnetization)"; break;
      }
      std::cout << std::endl;
      // return_moments_flag = true;
      // generator_needs_moment = true;
      break;
    default:
      std::cout << " ERROR: UNKNOWN EVEC GENERATION MODE\n";
      exit(1);
    }
    if (step_out_flag) std::cout << " Step output written to: " << step_out_name << std::endl;
    std::cout << std::endl;

    if (step_out_flag && (evec_generation_mode == WangLandau_1d))
    {
      // step_out_flag = false;
      snprintf(wl_step_out_name, 127, "wl1d_%s", step_out_name);
      wl_stepf = wl_step_out_name;
    }

    if (step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file << "#";
      for (int i = 0; i < argc; i++) step_out_file << " " << argv[i];
      step_out_file << std::endl << size_lsms << std::endl;
    }
  }

  if (generator_needs_moment) return_moments_flag = true;

  if (num_lsms == 1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank = shmem_my_pe();
    local_comm.size = shmem_n_pes();
    local_comm.start_pe = 0;
    local_comm.logPE_stride = 0;
    LSMS lsms_calc(local_comm, i_lsms_name, "1_");

    if (rank == 0)
    {
      std::cout << "executing LSMS(C++) for " << lsms_calc.numSpins() << " atoms\n";
      std::cout << "  LSMS version = " << lsms_calc.version() << std::endl;
    }

    if (energyCalculationMode == OneStepEnergy)
      std::cout << "one step Energy = " << lsms_calc.oneStepEnergy() << std::endl;
    else if (energyCalculationMode == MultiStepEnergy)
      std::cout << "multi-step Energy = " << lsms_calc.multiStepEnergy() << std::endl;
    else if (energyCalculationMode == ScfEnergy)
      std::cout << "self-consistent Energy = " << lsms_calc.scfEnergy() << std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
      // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    // int color = MPI_UNDEFINED;
    // assuming the user passes a power of two with "-align"
    int s = align;
    int comm_size = (size - align) / num_lsms;
    for (int i = 0; i < num_lsms; i++)
    {
      if ((world_rank >= s) && (world_rank < s + comm_size))
      {
        my_group = i;
        // color = i;
        new_peid = world_rank - s;
        new_root = s;
      }
      lsms_rank0[i] = s;
      s += comm_size;
    }
    if (world_rank == 0)
    {
      // color = num_lsms;
      new_peid = 0;
      comm_size = 1;
      new_root = 0;
    }
    // MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank = new_peid;
    local_comm.size = comm_size;
    local_comm.start_pe = new_root;
    local_comm.logPE_stride = 0;

    std::cout << "world_rank=" << world_rank << " -> group=" << my_group << std::endl;
    snprintf(prefix, 38, "Group %4d: ", my_group);

    // now we get ready to do some calculations...
    if (my_group >= 0)
    {
      double energy;
      double band_energy;
      static int i_values[10];  // static so the buffer is symmetric for SHMEM puts

      // MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix, 38, "%d_", my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix, 38, "/tmp/ompi/%d_", my_group);
      LSMS lsms_calc(local_comm, i_lsms_name, prefix);
      snprintf(prefix, 38, "Group %4d: ", my_group);

      if (rank == 0 && my_group == 0)
      {
        std::cout << prefix << "executing LSMS(C++) for " << lsms_calc.numSpins() << " atoms\n";
        std::cout << prefix << "  LSMS version = " << lsms_calc.version() << std::endl;
      }

      // wait for commands from master
      bool finished = false;
      while (!finished)
      {
        if (rank == 0)
        {
          // MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          // op = status.MPI_TAG;
          if (lsms_rank0[0] == world_rank) shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        }
        // MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe,
                          local_comm.logPE_stride, local_comm.size, pSync2);
        /* recognized opcodes:
           5: calculate energy
              recognized energy calculation modes:
              OneStepEnergy : calculate the frozen potential band energy in one
                              step (don't converge Ef); use only if the Fermi
                              energy will not change due to MC steps!
                              The only method available in LSMS_1.9.
              MultiStepEnergy : calculate the frozen potential band energy after
                                converging the Fermi energy. This should be the
                                new default method. If the Fermi energy doesn't
                                change, multiStepEnergy only performs one step
                                and should be equivalent to oneStepEnergy.
                                The tolerance for Ef convergence can be set with
                                LSMS::setEfTol(Real); the default is set in the
                                LSMS::LSMS constructor (currently 1.0e-6). The
                                maximum number of steps is read from the LSMS
                                input file 'nscf' parameter.
              ScfEnergy : calculate the self-consistent total energy. The
                          maximum number of steps is read from the LSMS input
                          file 'nscf' parameter. NOT IMPLEMENTED YET!!!
          10: get number of sites
        */
        if (op == 5)
        {
          lsms_calc.setEvec(evec);
          if (energyCalculationMode == OneStepEnergy)
            energy = lsms_calc.oneStepEnergy(&band_energy);
          else if (energyCalculationMode == MultiStepEnergy)
            band_energy = energy = lsms_calc.multiStepEnergy();
          else if (energyCalculationMode == ScfEnergy)
            energy = lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            // MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0] = energy;
          r_values[1] = band_energy;
          if (return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if (rank == 0)
          {
            if (return_moments_flag)
            {
              // MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET + 3 * size_lsms, 0);
            }
            else
            {
              // MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);
          }
        }
        else if (op == 10)
        {
          i_values[0] = lsms_calc.numSpins();
          // MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        }
        else
        {
          // printf("world rank %d: received exit\n",world_rank);
          finished = true;
        }
      }

      shfree(evec);
      // shfree(r_values);
    }
    else if (world_rank == 0)
    {
      int running;
      double **evecs;
      // double *r_values;
      // int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;

      char *wl_inf = NULL;
      char *wl_outf = NULL;
      if (gWL_in_name[0] != 0) wl_inf = gWL_in_name;
      if (gWL_out_name[0] != 0) wl_outf = gWL_out_name;

      EvecGenerator *generator;

      /*
      // get number of spins from the first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
      */

      evecs = (double **)shmalloc(sizeof(double *) * num_lsms);
      init_steps = (int *)shmalloc(sizeof(int) * num_lsms);
      for (int i = 0; i < num_lsms; i++)
      {
        evecs[i] = (double *)shmalloc(sizeof(double) * 3 * size_lsms);
        init_steps[i] = initial_steps;
      }
      total_init_steps = num_lsms * initial_steps;

      // Initialize the correct evec generator
      switch (evec_generation_mode)
      {
      case Random:
        generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant:
        generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
      // case WangLandau_1d:
      //   generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                   evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_1d:
        generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                          evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing:
        generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                       evecs, wl_inf, wl_outf);
        break;
      // case WangLandau_2d:
      //   generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                   evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d:
        generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                          evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default:
        std::cerr << "The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for (int i = 0; i < num_lsms; i++)
      {
        generator->initializeEvec(i, evecs[i]);
      }
      std::cout << "This is the master node\n";

      // issue initial commands to all LSMS instances
      running = 0;
      bool more_work = true;
      if (total_init_steps > 0)
      {
        for (int i = 0; i < num_lsms; i++)
        {
          std::cout << "starting initial calculation in group " << i << std::endl;
          // MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3 * size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();
          num_steps--;
          running++;
          stepCount++;
          if (restrict_steps) std::cout << "  " << num_steps << " steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        // first deal with the initial steps:
        while (running > 0)
        {
          // if(return_moments_flag)
          //   MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          // else
          //   MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          shmem_int_wait(&flag, -1);
          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if (total_init_steps > 0)
          {
            // int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group = (flag - align) / comm_size;
            std::cout << "starting additional calculation in group " << r_group << std::endl;
            if (init_steps[r_group] > 0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group, evecs[r_group], r_values[energyIndex]));
              init_steps[r_group]--;
              total_init_steps--;
            }
            // MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group], 3 * size_lsms, lsms_rank0[r_group]); // TODO check this
            shmem_fence();
            num_steps--;
            running++;
            stepCount++;
            if (restrict_steps && num_steps <= 0) more_work = false;
            if (restrict_steps) std::cout << "  " << num_steps << " steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if (restrict_time && walltime >= max_time) more_work = false;
            if (restrict_time) std::cout << "  " << max_time - walltime << " seconds remaining\n";
          }
        }
      }

      more_work = true;
      running = 0;
      for (int i = 0; i < num_lsms; i++)
      {
        std::cout << "starting main calculation in group " << i << std::endl;
        // MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3 * size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--;
        running++;
        stepCount++;
        if (restrict_steps) std::cout << "  " << num_steps << " steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);

      generator->startSampling();

      // wait for results and issue new commands or wind down
      while (running > 0)
      {
        // MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag, -1);
        running--;
        std::cout << "received energy E_tot =" << r_values[0] << std::endl;
        std::cout << "    band energy E_band=" << r_values[1] << std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator += r_values[0];
        energies_accumulated++;

        if (more_work)
        {
          int r_group = (status.MPI_SOURCE - align) / comm_size;
          std::cout << "starting additional calculation in group " << r_group << std::endl;

          if (generator_needs_moment)
          {
            double m0, m1, m2;
            m0 = 0.0; m1 = 0.0; m2 = 0.0;
            for (int i = 0; i < 3 * size_lsms; i += 3)
            {
              m0 += r_values[R_VALUE_OFFSET + i];
              m1 += r_values[R_VALUE_OFFSET + i + 1];
              m2 += r_values[R_VALUE_OFFSET + i + 2];
            }
            switch (second_dimension)
            {
            case MagneticMoment:  magnetization = std::sqrt(m0 * m0 + m1 * m1 + m2 * m2); break;
            case MagneticMomentX: magnetization = m0; break;
            case MagneticMomentY: magnetization = m1; break;
            case MagneticMomentZ: magnetization = m2; break;
            }
            if (generator->generateEvec(r_group, evecs[r_group], r_values[energyIndex],
                                        magnetization, &accepted))
              more_work = false;
          }
          else
          {
            if (generator->generateEvec(r_group, evecs[r_group], r_values[energyIndex], &accepted))
              more_work = false;
          }

          // MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group], 3 * size_lsms, lsms_rank0[r_group]); // TODO check this
          shmem_fence();
          num_steps--;
          running++;
          stepCount++;
          if (restrict_steps && num_steps <= 0) more_work = false;
          if (restrict_steps) std::cout << "  " << num_steps << " steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if (restrict_time && walltime >= max_time) more_work = false;
          if (restrict_time) std::cout << "  " << max_time - walltime << " seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group = (status.MPI_SOURCE - align) / comm_size;
          MPI_Send(evecs[r_group], 3 * size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if (step_out_flag && accepted)
        {
          step_out_file << "# iteration " << energies_accumulated << std::endl;
          step_out_file.precision(15);
          step_out_file << energies_accumulated << std::endl;
          step_out_file << r_values[0] << "  " << r_values[1] << std::endl;
          for (int j = 0; j < 3 * size_lsms; j += 3)
          {
            step_out_file << r_values[j + R_VALUE_OFFSET] << "  "
                          << r_values[j + R_VALUE_OFFSET + 1] << "  "
                          << r_values[j + R_VALUE_OFFSET + 2] << std::endl;
          }
        }

        // write restart file every restartWriteFrequency seconds
        if (walltime > nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime += restartWriteFrequency;
        }
      }
      generator->writeState("WLrestart.jsn");

      /*
      if(evec_generation_mode==WangLandau_1d)
        (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
      if(evec_generation_mode==ExhaustiveIsing)
        (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
      */

      for (int i = 0; i < num_lsms; i++) shfree(evecs[i]);
      shfree(evecs);
      // shfree(r_values);
    }
  }

  if (world_rank == 0)
  {
    if (step_out_flag)
    {
      step_out_file << "# end\n-1\n"
                    << energy_accumulator / double(energies_accumulated) << std::endl;
      step_out_file.close();
    }
    std::cout << "Finished all scheduled calculations. Freeing resources.\n";
    std::cout << "Energy mean = " << energy_accumulator / double(energies_accumulated) << "Ry\n";
  }

  if (num_lsms > 1)
  {
    // make sure everyone arrives here:
    MPI_Bcast(stupid, 37, MPI_CHAR, 0, MPI_COMM_WORLD);
    if (world_rank == 0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if (my_group >= 0)
    {
      MPI_Comm_free(&local_comm);
    }
  }

  if (world_rank == 0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout << " WL-LSMS finished in " << walltime << " seconds.\n";
    std::cout << " Monte-Carlo steps / walltime = "
              << double(stepCount) / walltime << "/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values, hw_counters);
  papi_values[hw_counters    ] = PAPI_get_real_cyc() - papi_real_cyc_0;
  papi_values[hw_counters + 1] = PAPI_get_real_usec() - papi_real_usec_0;
  papi_values[hw_counters + 2] = PAPI_get_virt_cyc() - papi_virt_cyc_0;
  papi_values[hw_counters + 3] = PAPI_get_virt_usec() - papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS + 4];
  /*
  for(int i=0; i<hw_counters; i++)
  {
    printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
              std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
            std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
            std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
            std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
            std::cout,parameters.myrankWorld);
  */
  // MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //            MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);
  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters + 4,
                        comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);

  if (world_rank == 0)
  {
    for (int i = 0; i < hw_counters; i++)
    {
      std::cout << "Accumulated: " << papi_event_name[i] << " = " << accumulated_counters[i] << "\n";
    }
    std::cout << "PAPI accumulated real cycles : " << accumulated_counters[hw_counters] << "\n";
    std::cout << "PAPI accumulated user cycles : " << accumulated_counters[hw_counters + 2] << "\n";
    double gflops_papi = ((double)accumulated_counters[1]) /
                         (1000.0 * (double)papi_values[hw_counters + 1]);
    double gflops_hw_double = ((double)accumulated_counters[2]) /
                              (1000.0 * (double)papi_values[hw_counters + 1]);
    double gflops_hw_single = ((double)accumulated_counters[3]) /
                              (1000.0 * (double)papi_values[hw_counters + 1]);
    double gips = ((double)accumulated_counters[0]) /
                  (1000.0 * (double)papi_values[hw_counters + 1]);
    std::cout << "PAPI_FP_OPS real GFLOP/s : " << gflops_papi << "\n";
    std::cout << "PAPI hw double real GFLOP/s : " << gflops_hw_double << "\n";
    std::cout << "PAPI hw single real GFLOP/s : " << gflops_hw_single << "\n";
    std::cout << "PAPI real GINST/s : " << gips << "\n";
  }
#endif

  // MPI_Finalize();
  return 0;
}
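/* Unit check for the GFLOP/s numbers above: counts / usec is operations per
   microsecond, i.e. 10^6 operations per second; dividing by a further 1000
   yields 10^9 operations per second, i.e. GFLOP/s. For example (made-up
   numbers), 5e12 FP ops over 1e9 usec (1000 s):
   5e12 / (1000 * 1e9) = 5 GFLOP/s. */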
main(int argc, char *argv[])
{
  float **a, **b, **c;
  int n, n1, n2;
  int i, j;
  //double t0,t1;
  struct timeval t0, t1;
  long mtime, seconds, useconds;

  // Using PAPI - from countloop.c
  if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT))
    ehandler("PAPI_library_init error.");

  const size_t EVENT_MAX = PAPI_num_counters();
  // Suppressing output
  // printf("# Max counters = %zu\n", EVENT_MAX);

  if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS.");
  if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS.");
  if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM.");

  size_t EVENT_COUNT = 3;
  int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM };
  long long values[EVENT_COUNT];

  // Take size from args, not prompt
  // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n);
  n = atoi(argv[1]);
  //printf("Enter n1: "); scanf("%d",&n1); printf("n1 = %d\n",n1);
  //printf("Enter n2: "); scanf("%d",&n2); printf("n2 = %d\n",n2);

  // To conform to the other matrix functions
  n1 = floor(sqrt(n));
  n2 = n1;
  n = n1 * n2;
  //printf("n = %d X %d = %d\n",n1,n2,n);

  a = matrix(1, n, 1, n);
  for (i = 1; i <= n; i++)
    for (j = 1; j <= n; j++)
      a[i][j] = i + j;
  b = matrix(1, n, 1, n);
  for (i = 1; i <= n; i++)
    for (j = 1; j <= n; j++)
      b[i][j] = i - j;

  //#ifdef PRINT
  //print_matrix(a,1,n,1,n); printf("\n");
  //print_matrix(b,1,n,1,n); printf("\n");
  //#endif

  //t0 = get_seconds();
  //c = matrix_prod(n,n,n,n,a,b);
  //t1 = get_seconds();
  //printf("Time for matrix_prod = %f sec\n",t1-t0);

  //t0 = get_seconds();
  gettimeofday(&t0, NULL);

  // Start PAPI
  PAPI_start_counters(events, EVENT_COUNT);
  if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
    ehandler("Problem reading counters.");

  c = block_prod(n1, n1, n1, n2, n2, n2, a, b);

  if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
    ehandler("Problem reading counters.");

  //t1 = get_seconds();
  //printf("Time for block_prod = %f sec\n",t1-t0);
  gettimeofday(&t1, NULL);
  seconds = t1.tv_sec - t0.tv_sec;
  useconds = t1.tv_usec - t0.tv_usec;
  mtime = ((seconds) * 1000 + useconds / 1000.0) + 0.5;

  printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime);
}
int main(int argc, char **argv)
{
    int quiet = 0;
    int events[1], i;
    long long counts[1];
    int retval, num_counters;
    char test_string[] = "Testing PAPI_L1_DCA predefined event...";

    // quiet=test_quiet();

    retval = PAPI_library_init(PAPI_VER_CURRENT);
    if (retval != PAPI_VER_CURRENT) {
        // if (!quiet) printf("Error PAPI_library_init: %d\n",retval);
        // test_fail(test_string);
    }

    retval = PAPI_query_event(PAPI_L1_DCA);
    if (retval != PAPI_OK) {
        // if (!quiet) printf("PAPI_L1_DCA not available\n");
        // test_fail(test_string);
    }

    num_counters = PAPI_num_counters();
    events[0] = PAPI_L1_DCA;

    /********************************************************************/
    /* Test if the C compiler uses a sane number of data cache accesses */
    /********************************************************************/

#define ARRAYSIZE 1000
    double array[ARRAYSIZE];
    double aSumm = 0.0;

    if (!quiet) printf("Write test:\n");

    PAPI_start_counters(events, 1);
    for (i = 0; i < ARRAYSIZE; i++) {
        array[i] = (double)i;
    }
    PAPI_stop_counters(counts, 1);

    // if (!quiet) {
    printf("\tL1 D accesses: %lld\n", counts[0]);
    printf("\tShould be roughly: %d\n", ARRAYSIZE);
    // }

    PAPI_start_counters(events, 1);
    for (i = 0; i < ARRAYSIZE; i++) {
        aSumm += array[i];
    }
    PAPI_stop_counters(counts, 1);

    // if (!quiet) {
    printf("Read test (%lf):\n", aSumm);
    printf("\tL1 D accesses: %lld\n", counts[0]);
    printf("\tShould be roughly: %d\n", ARRAYSIZE);
    // }

    PAPI_shutdown();
    // test_pass(test_string);
    return 0;
}
void ipm_hpm_start()
{
  int i, j, k, rv;
#ifndef HPM_DISABLED
#ifdef HPM_PAPI
  char errstring[PAPI_MAX_STR_LEN];
  char event_name[PAPI_MAX_STR_LEN];
#endif

  if (task.hpm_eventset < 0) { return; }

  for (i = 0; i < MAXSIZE_REGION; i++) {
    for (j = 0; j < MAXSIZE_NEVENTSETS; j++) {
      for (k = 0; k < MAXSIZE_HPMCOUNTERS; k++) {
        task.hpm_count[i][j][k] = 0;
      }
    }
  }

#ifdef HPM_PAPI
  /* Initialize the low level PAPI library */
  rv = PAPI_library_init(PAPI_VER_CURRENT);
  if (rv != PAPI_VER_CURRENT) {
    printf("IPM: %d papi_error library_init in hpm_start rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    perror("PAPI_library_init");
  }
  if (rv == PAPI_VER_CURRENT) {
    if (task.flags & DEBUG && task.mpi_rank == 0) {
      printf("IPM: %d PAPI_library_init in hpm_start rv=%d \"%s\"\n",
             task.mpi_rank, rv, PAPI_strerror(rv));
    }
  }

#ifdef CPU_PPC450D
  /* then we are on a bluegene P */
  if (node_rank != 0) return;
#endif

  rv = PAPI_num_counters();
  if (rv < 0) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error num_counters in hpm_start rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
  }

  if ((hwinfo = PAPI_get_hardware_info()) == NULL) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error PAPI_get_hardware_info in hpm_start rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
  } else {
    /* do something clever wrt. formal machine description */
  }

  rv = PAPI_create_eventset(&papi_eventset[0]);
  if (rv != PAPI_OK) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error create_eventset in hpm_start rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
  }

  if (0) {
    printf("PAPI: JIE_DEBUG:: rank %d task.hpm_eventset is %d.\n",
           task.mpi_rank, task.hpm_eventset);
    fflush(stdout);
  }

  for (i = 0; i < MAXSIZE_HPMCOUNTERS; i++) {
    if (papi_event[task.hpm_eventset][i] != -1) {
      rv = PAPI_query_event(papi_event[task.hpm_eventset][i]);
      if (rv != PAPI_OK) {
        PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
        PAPI_event_code_to_name(papi_event[task.hpm_eventset][i], event_name);
        printf("IPM: %d papi_error query_event %s %d \"%s\"\n",
               task.mpi_rank, event_name, rv, PAPI_strerror(rv));
      }

      rv = PAPI_add_event(papi_eventset[0], papi_event[task.hpm_eventset][i]);
      if (rv != PAPI_OK) {
        PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
        PAPI_event_code_to_name(papi_event[task.hpm_eventset][i], event_name);
        printf("IPM: %d papi_error add_event %s %d \"%s\"\n",
               task.mpi_rank, event_name, rv, PAPI_strerror(rv));
      }

      if (0) {
        PAPI_event_code_to_name(papi_event[task.hpm_eventset][i], event_name);
        printf("PAPI: JIE_DEBUG:: rank %d add event %s.\n", task.mpi_rank, event_name);
        fflush(stdout);
      }
    }
  }

  rv = PAPI_start(papi_eventset[0]);
  if (rv != PAPI_OK) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error: start in hpm_start rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
    task.flags &= ~IPM_HPM_ACTIVE;
  }
#endif

#ifdef HPM_PMAPI
#ifdef AIX51
  rv = pm_init(PM_VERIFIED | PM_UNVERIFIED | PM_CAVEAT | PM_GET_GROUPS,
               &pmapi_info, &pmapi_groups_info);
#else
  rv = pm_initialize(PM_VERIFIED | PM_UNVERIFIED | PM_CAVEAT | PM_GET_GROUPS,
                     &pmapi_info, &pmapi_groups_info, PM_CURRENT);
#endif
  if (rv) {
    printf("IPM: %d pmapi_error: pm_initialize \n", task.mpi_rank);
    pm_error("IPM: pmapi : pm_initialize", rv);
  }

  for (i = 0; i < pmapi_info.maxpmcs; i++)
    pmapi_prog.events[i] = COUNT_NOTHING;

  pmapi_prog.mode.w = 0;
  pmapi_prog.mode.b.user = 1;
  pmapi_prog.mode.b.proctree = 1;

#ifndef POWER3
#ifdef CPU_POWER4
  pmapi_prog.mode.b.is_group = 1;
  if (task.hpm_eventset == 0) { pmapi_prog.events[0] = 60; }
  if (task.hpm_eventset == 1) { pmapi_prog.events[0] = 59; }
  if (task.hpm_eventset == 2) { pmapi_prog.events[0] = 5; }
  if (task.hpm_eventset == 3) { pmapi_prog.events[0] = 58; }
  if (task.hpm_eventset == 4) { pmapi_prog.events[0] = 53; }
#endif
#ifdef CPU_POWER5
  pmapi_prog.mode.b.is_group = 1;
  if (task.hpm_eventset == 0) { pmapi_prog.events[0] = 137; }
#endif
#ifdef CPU_POWER6
  pmapi_prog.mode.b.is_group = 1;
  /* use all the pm_hpm* groups 186 - 195 */
  pmapi_prog.events[0] = 186 + task.hpm_eventset;
#endif
#else
  for (i = 0; i < MAXSIZE_HPMCOUNTERS; i++) {
    pmapi_prog.events[i] = pmapi_event[task.hpm_eventset][i];
  }
#endif

  rv = pm_set_program_mythread(&pmapi_prog);
  if (rv) {
    printf("IPM: %d pmapi_error: pm_set_program_mythread\n", task.mpi_rank);
    pm_error("IPM: pmapi : pm_set_program_mythread", rv);
  }

  rv = pm_start_mythread();
  if (rv) {
    printf("IPM: %d pmapi_error: pm_start_mythread\n", task.mpi_rank);
    pm_error("IPM: pmapi : pm_start_mythread", rv);
    task.flags &= ~IPM_HPM_ACTIVE;
  }

  rv = pm_reset_data_mythread();
  if (rv) {
    printf("IPM: %d pmapi_error: pm_reset_data_mythread\n", task.mpi_rank);
  }
#endif
#endif
  return;
}
int main(int argc, char **argv)
{
    int opts = global_options(&argc, &argv);
    int data_len;

#ifdef HAVE_LIBPAPI
    int papi_counters = PAPI_num_counters();
    if (papi_counters < papi_array_len) {
        fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n",
                progname, papi_counters);
        papi_array_len = papi_counters;
    }
    if (papi_test(papi_events, papi_array_len))
        exit(1);

    for (int nv = 0; nv <= papi_array_len; ++nv)
        loop_calibration[nv] = 100000000;

    data_len = papi_array_len + 1;
#else
    data_len = 2;
#endif

    if (opts < 0 || argc < 2 || argc > 5) {
        print_help_and_exit();
    }

    struct elim_params params;
    params.m = atoi(argv[1]);
    if (argc >= 3) params.n = atoi(argv[2]);
    else           params.n = params.m;
    if (argc >= 4) params.algorithm = argv[3];
    else           params.algorithm = "ple";
    if (argc >= 5) params.r = atoi(argv[4]);
    else           params.r = MIN(params.m, params.n);

    srandom(17);

    unsigned long long data[16];
    for (int i = 0; i < 4; ++i)
        run_nothing((void*)&params, data, &data_len);

    run_bench(run, (void*)&params, data, data_len);

    double cc_per_op = ((double)data[1]) /
        ((double)params.m * (double)params.n * powl((double)params.r, 0.807));
    printf("m: %5d, n: %5d, last r: %5d, cpu cycles: %12llu, cc/(mnr^0.807): %.5lf, ",
           params.m, params.n, params.r, data[1], cc_per_op);
    print_wall_time(data[0] / 1000000.0);
    printf(", ");
    print_cpu_time(data[1] / (double)cpucycles_persecond());
    printf("\n");

#ifdef HAVE_LIBPAPI
    for (int n = 1; n < data_len; ++n) {
        double tmp = ((double)data[n]) / powl((double)params.n, 2.807);
        printf("%20s (%20llu) per bit (divided by n^2.807): %15.5f\n",
               papi_event_name(papi_events[n - 1]), data[n], tmp);
    }
#endif
    return 0;
}
main(int argc, char *argv[])
{
  float **a, **b, **c;
  int n;
  int NB;
  int i, j;
  int x;
  //double t0,t1;
  struct timeval t0, t1;
  long mtime, seconds, useconds;

  // Using PAPI - from countloop.c
  if (PAPI_VER_CURRENT != PAPI_library_init(PAPI_VER_CURRENT))
    ehandler("PAPI_library_init error.");

  const size_t EVENT_MAX = PAPI_num_counters();
  // Suppressing output
  // printf("# Max counters = %zu\n", EVENT_MAX);

  if (PAPI_OK != PAPI_query_event(PAPI_TOT_INS)) ehandler("Cannot count PAPI_TOT_INS.");
  if (PAPI_OK != PAPI_query_event(PAPI_FP_OPS)) ehandler("Cannot count PAPI_FP_OPS.");
  if (PAPI_OK != PAPI_query_event(PAPI_L1_DCM)) ehandler("Cannot count PAPI_L1_DCM.");

  size_t EVENT_COUNT = 3;
  int events[] = { PAPI_TOT_INS, PAPI_FP_OPS, PAPI_L1_DCM };
  long long values[EVENT_COUNT];

  // Take size from args, not prompt
  // printf("Enter n: "); scanf("%d",&n); printf("n = %d\n",n);
  n = atoi(argv[1]);
  NB = atoi(argv[2]);

  a = matrix(1, n, 1, n);
  for (i = 1; i <= n; i++)
    for (j = 1; j <= n; j++)
      a[i][j] = i + j;
  b = matrix(1, n, 1, n);
  for (i = 1; i <= n; i++)
    for (j = 1; j <= n; j++)
      b[i][j] = i - j;

  //t0 = get_seconds();
  gettimeofday(&t0, NULL);

  // Start PAPI
  PAPI_start_counters(events, EVENT_COUNT);
  if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
    ehandler("Problem reading counters.");

  //for (x=0;x<1000;x++){
  c = matrix_prod(n, n, n, n, a, b, NB);
  //}

  if (PAPI_OK != PAPI_read_counters(values, EVENT_COUNT))
    ehandler("Problem reading counters.");

  //t1 = get_seconds();
  gettimeofday(&t1, NULL);
  seconds = t1.tv_sec - t0.tv_sec;
  useconds = t1.tv_usec - t0.tv_usec;
  mtime = ((seconds) * 1000 + useconds / 1000.0) + 0.5;

  //printf("Time for matrix_prod = %f sec\n",t1-t0);
  printf("%d\t%lld\t%lld\t%lld\t%ld\n", n, values[0], values[1], values[2], mtime);
}
void ipm_pthread_hpm_init()
{
  int i, j, k, rv;
#ifdef HPM_PAPI
  char errstring[PAPI_MAX_STR_LEN];
  char event_name[PAPI_MAX_STR_LEN];
#endif

  //task.flags |= DEBUG;
  if (task.hpm_eventset < 0) { return; }

  for (i = 0; i < MAXSIZE_REGION; i++) {
    for (j = 0; j < MAXSIZE_NEVENTSETS; j++) {
      for (k = 0; k < MAXSIZE_HPMCOUNTERS; k++) {
        task.hpm_count[i][j][k] = 0;
      }
    }
  }

#ifdef HPM_PAPI
  for (i = 0; i < MAXSIZE_NTHREADS; i++) {
    papi_eventset[i] = PAPI_NULL;
  }

  /* Initialize the low level PAPI library */
  rv = PAPI_library_init(PAPI_VER_CURRENT);
  if (rv != PAPI_VER_CURRENT) {
    printf("IPM: %d papi_error library_init in hpm_init rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    perror("PAPI_library_init");
  }

  rv = PAPI_thread_init(pthread_self);
  if (rv != PAPI_OK) {
    if (task.flags & DEBUG && task.mpi_rank == 0) {
      printf("IPM: %d PAPI_thread_init in hpm_init rv=%d \"%s\"\n",
             task.mpi_rank, rv, PAPI_strerror(rv));
      PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
      perror("PAPI_thread_init");
    }
  }

  rv = PAPI_num_counters();
  if (rv < 0) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error num_counters in hpm_init rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
  }

  if ((hwinfo = PAPI_get_hardware_info()) == NULL) {
    PAPI_perror(rv, errstring, PAPI_MAX_STR_LEN);
    printf("IPM: %d papi_error PAPI_get_hardware_info in hpm_init rv=%d \"%s\"\n",
           task.mpi_rank, rv, PAPI_strerror(rv));
  } else {
    /* do something clever wrt. formal machine description */
  }
#endif
  return;
}
int main(int argc, char *argv[])
{
    float rtime1, rtime2, ptime1, ptime2, mflops;
    long long flpops;
    unsigned long int tid = 0;
    int num_hwcntrs = 0;
    int fip = 0, retval;
    float real_time, proc_time;
    long long flpins;
    int i;
    int EventSet = PAPI_NULL;
    int count = 0, err_count = 0;
    PAPI_event_info_t info;
    long long values2[2][2];
    long long min, max;
    int PAPI_event, mythreshold = THRESHOLD;
    char event_name1[PAPI_MAX_STR_LEN];
    const PAPI_hw_info_t *hw_info = NULL;
    int num_events, mask;
    int num_flops = NUM_FLOPS;
    long long elapsed_us, elapsed_cyc;

    tests_quiet(argc, argv);   /* Set TESTS_QUIET variable */

    retval = PAPI_library_init(PAPI_VER_CURRENT);
    if (retval != PAPI_VER_CURRENT)
        test_fail(__FILE__, __LINE__, "PAPI_library_init", retval);

    retval = PAPI_create_eventset(&EventSet);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_create_eventset", retval);

    /* Get hardware info */
    hw_info = PAPI_get_hardware_info();
    if (hw_info == NULL)
        test_fail(__FILE__, __LINE__, "PAPI_get_hardware_info", 2);

    EventSet = add_two_nonderived_events(&num_events, &PAPI_event, &mask);

    printf("Using %#x for the overflow event\n", PAPI_event);

    if (PAPI_event == PAPI_FP_INS) {
        mythreshold = THRESHOLD;
    } else {
#if defined(linux)
        mythreshold = (int)hw_info->cpu_max_mhz * 20000;
#else
        mythreshold = THRESHOLD * 2;
#endif
    }

    retval = PAPI_start(EventSet);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_start", retval);

    do_flops(NUM_FLOPS);

    /* stop the calibration run */
    retval = PAPI_stop(EventSet, values2[0]);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

    /* set up overflow handler */
    retval = PAPI_overflow(EventSet, PAPI_event, mythreshold, 0, handler);
    if (retval != PAPI_OK) {
        test_fail(__FILE__, __LINE__, "PAPI_overflow", retval);
    }

    /* Start overflow run */
    retval = PAPI_start(EventSet);
    if (retval != PAPI_OK) {
        test_fail(__FILE__, __LINE__, "PAPI_start", retval);
    }

    do_flops(num_flops);

    /* stop overflow run */
    retval = PAPI_stop(EventSet, values2[1]);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_stop", retval);

    retval = PAPI_overflow(EventSet, PAPI_event, 0, 0, handler);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_overflow", retval);

    if (!TESTS_QUIET) {
        if ((retval = PAPI_event_code_to_name(PAPI_event, event_name1)) != PAPI_OK)
            test_fail(__FILE__, __LINE__, "PAPI_event_code_to_name", retval);

        printf("Test case: Overflow dispatch of 2nd event in set with 2 events.\n");
        printf("---------------------------------------------------------------\n");
        printf("Threshold for overflow is: %d\n", mythreshold);
        printf("Using %d iterations\n", num_flops);
        printf("-----------------------------------------------\n");
        printf("Test type    : %16d%16d\n", 1, 2);
        printf(OUT_FMT, event_name1, values2[0][1], values2[1][1]);
        printf(OUT_FMT, "PAPI_TOT_CYC", values2[0][0], values2[1][0]);
        printf("Overflows    : %16s%16d\n", "", total);
        printf("-----------------------------------------------\n");
    }

    retval = PAPI_cleanup_eventset(EventSet);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset", retval);

    retval = PAPI_destroy_eventset(&EventSet);
    if (retval != PAPI_OK)
        test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset", retval);

    if (!TESTS_QUIET) {
        printf("Verification:\n");
#if defined(linux) || defined(__ia64__) || defined(_POWER4)
        num_flops *= 2;
#endif
        if (PAPI_event == PAPI_FP_INS || PAPI_event == PAPI_FP_OPS) {
            printf("Row 1 approximately equals %d %d\n", num_flops, num_flops);
        }
        printf("Column 1 approximately equals column 2\n");
        printf("Row 3 approximately equals %u +- %u %%\n",
               (unsigned)(values2[0][1] / (long long)mythreshold),
               (unsigned)(OVR_TOLERANCE * 100.0));
    }

    min = (long long)(((double)values2[0][1] * (1.0 - OVR_TOLERANCE)) / (double)mythreshold);
    max = (long long)(((double)values2[0][1] * (1.0 + OVR_TOLERANCE)) / (double)mythreshold);
    printf("Overflows: total(%d) > max(%lld) || total(%d) < min(%lld) \n", total, max, total, min);
    if (total > max || total < min)
        test_fail(__FILE__, __LINE__, "Overflows", 1);

    printf("Initial thread id is: %lu\n", tid);

    /* Initialize the PAPI library and get the number of counters available */
    if ((num_hwcntrs = PAPI_num_counters()) <= 0)
        handle_error(1);

    /* The installation supports PAPI, but has no counters */
    if ((num_hwcntrs = PAPI_num_counters()) == 0)
        fprintf(stderr, "Info:: This machine does not provide hardware counters.");

    printf("This system has %d available counters.\n", num_hwcntrs);

    if (num_hwcntrs > 2)
        num_hwcntrs = 2;

    /* Start counting events */
    if (PAPI_start_counters(Events, num_hwcntrs) != PAPI_OK)
        handle_error(1);

    if (argc != 8) {
        printf("\nError :: Usage: a.out <db_file> <num_elems> <queries_file> <num_queries> <N_THREADS> <K> <obj_dimension>\n");
        return 0;
    }
    TOPK = atoi(argv[6]);
    DIM = atoi(argv[7]);

    double **DB;
    double **Consultas;   /* queue of queries */
    int N_QUERIES, N_DB;
    char str_f[256];
    double dato[DIM];
    int j;
    FILE *f_dist, *fquery;
    Elem *heap, e_temp, *answer;
    int *acum, N_THREADS;

    /* N_THREADS is the number of threads the parallel region is launched with */
    N_THREADS = atoi(argv[5]);
    /* N_QUERIES is the number of queries */
    N_QUERIES = atoi(argv[4]);
    N_DB = atoi(argv[2]);

    printf("\nN_QUERIES = %d\nN_THREADS = %d\n", N_QUERIES, N_THREADS);
    fflush(stdout);

    acum = (int *)malloc(sizeof(int) * N_THREADS);
    for (i = 0; i < N_THREADS; i++)
        acum[i] = 0;

    sprintf(str_f, "%s", argv[1]);
    printf("\nOpening %s... ", argv[1]);
    fflush(stdout);
    f_dist = fopen(str_f, "r");
    printf("OK\n");
    fflush(stdout);

    Consultas = (double **)malloc(sizeof(double *) * N_QUERIES);
    for (i = 0; i < N_QUERIES; i++)
        Consultas[i] = (double *)malloc(sizeof(double) * DIM);

    DB = (double **)malloc(sizeof(double *) * N_DB);
    for (i = 0; i < N_DB; i++)
        DB[i] = (double *)malloc(sizeof(double) * DIM);

    answer = (Elem *)malloc(sizeof(Elem) * N_QUERIES * TOPK);

    printf("\nLoading DB... ");
    fflush(stdout);

    for (i = 0; i < N_DB; i++) {
        /* Use leedato_cophir() with the Cophir DB to avoid problems with the "," separators */
        /* if (leedato_cophir(dato, f_dist) == ERROR || feof(f_dist)) */
        if (leedato(dato, f_dist) == ERROR || feof(f_dist)) {
            printf("\n\nERROR :: N_DB set incorrectly\n\n");
            fflush(stdout);
            fclose(f_dist);
            break;
        }
        copiavalor(DB[i], dato);
    }
    fclose(f_dist);
    printf("OK\n");
    fflush(stdout);

    if ((fquery = fopen(argv[3], "r")) == NULL)
        printf("Error opening the queries file for reading: %s\n", argv[3]);
    else
        printf("Opening %s for reading\n", argv[3]);

    printf("\nLoading queries... ");
    fflush(stdout);
    for (i = 0; i < N_QUERIES; i++) {
        /* Use leedato_cophir() with the Cophir DB to avoid problems with the "," separators */
        /* if (leedato_cophir(dato, fquery) == ERROR || feof(fquery)) */
        if (leedato(dato, fquery) == ERROR || feof(fquery)) {
            printf("\n\nERROR :: N_QUERIES set incorrectly: fewer queries than specified\n\n");
            fflush(stdout);
            fclose(fquery);
            break;
        }
        copiavalor(Consultas[i], dato);
    }
    fclose(fquery);
    printf("OK\n");
    fflush(stdout);

    PAPI_start_counters((int *)Events, NUM_EVENTS);

    omp_set_num_threads(N_THREADS);

    elapsed_us = PAPI_get_real_usec();
    elapsed_cyc = PAPI_get_real_cyc();

    retval = PAPI_thread_init((unsigned long (*)(void))(omp_get_thread_num));
    if (retval != PAPI_OK) {
        if (retval == PAPI_ECMP)
            test_skip(__FILE__, __LINE__, "PAPI_thread_init", retval);
        else
            test_fail(__FILE__, __LINE__, "PAPI_thread_init", retval);
    }

#pragma omp parallel shared(Consultas, DB, N_QUERIES, N_DB, N_THREADS, acum, DIM)
    {
        float real_time;
        struct timeval t1, t2;
        int i, j;
        Elem *heap, e_temp;
        double d;
        int n_elem = 0;
        int trid = omp_get_thread_num();     /* thread ID */
        int procs = omp_get_num_threads();   /* total number of threads */
        double suma = 0;

        suma = 0;
        heap = (Elem *)malloc(sizeof(Elem) * TOPK);

#pragma omp barrier

#pragma omp master
        {
            gettimeofday(&t1, 0);
        }

        /* Each thread handles a subset of the queries, walking the query
           array round-robin. */
        for (i = trid; i < N_QUERIES; i += procs) {
            n_elem = 0;
            for (j = 0; j < N_DB; j++) {
                d = distancia(Consultas[i], DB[j]);
                /* If the object's distance to the query is smaller than the
                   heap root, insert it; the root always holds the largest
                   distance. */
                if (n_elem < TOPK) {
                    e_temp.dist = d;
                    e_temp.ind = j;
                    inserta2(heap, &e_temp, &n_elem);
                }
                if (n_elem == TOPK) {
                    if (d < topH(heap, &n_elem)) {
                        e_temp.dist = d;
                        e_temp.ind = j;
                        /* if the heap is not full, insert the element */
                        if (n_elem < TOPK)
                            inserta2(heap, &e_temp, &n_elem);
                        /* if the heap is full, popush2() pops the current
                           maximum and inserts the new element */
                        else
                            popush2(heap, &n_elem, &e_temp);
                    }
                }
            }
            /* At this point 'heap' holds the K elements closest to the query;
               extract them with extrae2(). */
            for (j = 0; j < TOPK; j++) {
                extrae2(heap, &n_elem, &e_temp);
                answer[i * TOPK + j].ind = e_temp.ind;
                answer[i * TOPK + j].dist = e_temp.dist;
            }
            /* Touch the results so the compiler does not optimize away work
               it considers unused: each thread simply sums the distances of
               the elements closest to the query. */
        }

        Thread(1000000 * (tid + 1));
        fflush(stdout);

#pragma omp barrier
#pragma omp master
        {
            if (fip > 0) {
                /* Setup PAPI library and begin collecting data from the counters */
                if (fip == 1) {
                    if ((retval = PAPI_flips(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
                        test_fail(__FILE__, __LINE__, "PAPI_flips", retval);
                } else {
                    if ((retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
                        test_fail(__FILE__, __LINE__, "PAPI_flops", retval);
                }

                gettimeofday(&t2, 0);
                real_time = (t2.tv_sec - t1.tv_sec) + (float)(t2.tv_usec - t1.tv_usec) / 1000000;

                Salida_Multihilo = fopen("Salida_Multihilo.txt", "w");
                for (i = 0; i < N_QUERIES; ++i) {
                    fprintf(Salida_Multihilo, "Query id:: %d\n", i);
                    for (j = 0; j < TOPK; ++j) {
                        fprintf(Salida_Multihilo, "ind = %d :: dist = %f\n",
                                answer[(i * TOPK) + j].ind, answer[(i * TOPK) + j].dist);
                    }
                    fprintf(Salida_Multihilo, "---------------------------------\n");
                }
                fclose(Salida_Multihilo);

                printf("\n\nK = %d", TOPK);
                printf("\nReal Time = %f seconds.\n", real_time);
                fflush(stdout);

                if (fip == 1) {
                    if ((retval = PAPI_flips(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
                        test_fail(__FILE__, __LINE__, "PAPI_flips", retval);
                } else {
                    if ((retval = PAPI_flops(&real_time, &proc_time, &flpins, &mflops)) < PAPI_OK)
                        test_fail(__FILE__, __LINE__, "PAPI_flops", retval);
                }

                if (!TESTS_QUIET) {
                    if (fip == 1) {
                        printf("Real_time: %f Proc_time: %f Total flpins: ", real_time, proc_time);
                    } else {
                        printf("Real_time: %f Proc_time: %f Total flpops: ", real_time, proc_time);
                    }
                    printf(LLDFMT, flpins);
                    printf(" MFLOPS: %f\n", mflops);
                }
            }
        }
        free(heap);
    } /* end pragma omp parallel */

    elapsed_cyc = PAPI_get_real_cyc() - elapsed_cyc;
    elapsed_us = PAPI_get_real_usec() - elapsed_us;

    if (!TESTS_QUIET) {
        printf("Master real usec   : \t%lld\n", elapsed_us);
        printf("Master real cycles : \t%lld\n", elapsed_cyc);
    }

    const PAPI_hw_info_t *hwinfo = NULL;
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
        exit(1);
    if ((hwinfo = PAPI_get_hardware_info()) == NULL)
        exit(1);

    /* the TLB and cache descriptions live inside hwinfo->mem_hierarchy */
    const PAPI_mh_level_t *mhlevel = &hwinfo->mem_hierarchy.level[0];
    const PAPI_mh_tlb_info_t *mhinfo = &mhlevel->tlb[0];
    const PAPI_mh_cache_info_t *mhcacheinfo = &mhlevel->cache[0];

    printf("\n\nCurrent machine information follows\n\n");
    printf("MH Type %d - Num entries %d - Associativity %d \n",
           mhinfo->type, mhinfo->num_entries, mhinfo->associativity);
    printf("Cache MH type %d size %d line size %d num_lines %d Associativity %d\n\n",
           mhcacheinfo->type, mhcacheinfo->size, mhcacheinfo->line_size,
           mhcacheinfo->num_lines, mhcacheinfo->associativity);

    retval = papi_print_header("Available PAPI preset and user defined events plus hardware information.\n",
                               &hwinfo);

    printf("Total hardware flops = %lld\n", values[1]);
    printf("L2 data cache misses is %lld\n", values[0]);

    retval = PAPI_stop_counters(values, NUM_EVENTS);

    return 0;
}
/**
 * Called by the CBTF collector service in order to start data collection.
 */
void cbtf_collector_start(const CBTF_DataHeader* header)
{
    /**
     * Start sampling.
     *
     * Starts hardware counter (HWC) sampling for the thread executing this
     * function. Initializes the appropriate thread-local data structures and
     * then enables the sampling counter.
     *
     * @param arguments    Encoded function arguments.
     */

    /* Create and access our thread-local storage */
#ifdef USE_EXPLICIT_TLS
    TLS* tls = malloc(sizeof(TLS));
    Assert(tls != NULL);
    CBTF_SetTLS(TLSKey, tls);
#else
    TLS* tls = &the_tls;
#endif
    Assert(tls != NULL);
    tls->defer_sampling = false;

#ifndef NDEBUG
    IsCollectorDebugEnabled = (getenv("CBTF_DEBUG_COLLECTOR") != NULL);
    IsCollectorDetailsDebugEnabled = (getenv("CBTF_DEBUG_COLLECTOR_DETAILS") != NULL);
#if defined (HAVE_OMPT)
    IsOMPTDebugEnabled = (getenv("CBTF_DEBUG_COLLECTOR_OMPT") != NULL);
#endif
#endif

    /* Decode the passed function arguments */
    // Need to handle the arguments...
    CBTF_hwcsamp_start_sampling_args args;
    memset(&args, 0, sizeof(args));
    args.sampling_rate = 100;

    /* First set defaults */
    int hwcsamp_rate = 100;
    char* hwcsamp_papi_event = "PAPI_TOT_CYC,PAPI_TOT_INS";

#if defined (CBTF_SERVICE_USE_OFFLINE)
    char* hwcsamp_event_param = getenv("CBTF_HWCSAMP_EVENTS");
    if (hwcsamp_event_param != NULL) {
        hwcsamp_papi_event = hwcsamp_event_param;
    }

    const char* sampling_rate = getenv("CBTF_HWCSAMP_RATE");
    if (sampling_rate != NULL) {
        hwcsamp_rate = atoi(sampling_rate);
    }
    args.collector = 1;
    args.experiment = 0;
    tls->data.interval = (uint64_t)(1000000000) / (uint64_t)(hwcsamp_rate);
#endif

    /* Initialize the actual data blob */
    memcpy(&tls->header, header, sizeof(CBTF_DataHeader));
    initialize_data(tls);

    /* We cannot assign the mpi rank in the header at this point as it may not
     * be set yet. Assign an integer tid value. omp_tid is used regardless of
     * whether the application is using openmp threads.
     * libmonitor uses the same numbering scheme as openmp. */
    tls->header.omp_tid = monitor_get_thread_num();
    tls->header.id = strdup(cbtf_collector_unique_id);
    tls->header.time_begin = CBTF_GetTime();

#ifndef NDEBUG
    if (IsCollectorDebugEnabled) {
        fprintf(stderr,"[%ld,%d] ENTER cbtf_collector_start\n", tls->header.pid, tls->header.omp_tid);
    }
#endif

    if (hwcsamp_papi_init_done == 0) {
#ifndef NDEBUG
        if (IsCollectorDebugEnabled) {
            fprintf(stderr,"[%ld,%d] cbtf_collector_start: initialize papi\n", tls->header.pid, tls->header.omp_tid);
        }
#endif
        CBTF_init_papi();
        tls->EventSet = PAPI_NULL;
        tls->data.clock_mhz = (float) hw_info->mhz;
        hwcsamp_papi_init_done = 1;
    } else {
        tls->data.clock_mhz = (float) hw_info->mhz;
    }

    /* PAPI SETUP */
    CBTF_Create_Eventset(&tls->EventSet);

    int rval = PAPI_OK;

#ifndef NDEBUG
    if (IsCollectorDebugEnabled) {
        fprintf(stderr, "PAPI Version: %d.%d.%d.%d\n",
                PAPI_VERSION_MAJOR( PAPI_VERSION ),
                PAPI_VERSION_MINOR( PAPI_VERSION ),
                PAPI_VERSION_REVISION( PAPI_VERSION ),
                PAPI_VERSION_INCREMENT( PAPI_VERSION ) );
        fprintf(stderr,"System has %d hardware counters.\n", PAPI_num_counters());
    }
#endif

    /* In Component PAPI, EventSets must be assigned a component index
     * before you can fiddle with their internals. 0 is always the cpu component. */
#if (PAPI_VERSION_MAJOR(PAPI_VERSION)>=4)
    rval = PAPI_assign_eventset_component( tls->EventSet, 0 );
    if (rval != PAPI_OK) {
        CBTF_PAPIerror(rval,"CBTF_Create_Eventset assign_eventset_component");
        return;
    }
#endif

    /* NOTE: if multiplex is turned on, PAPI internally uses a SIGPROF handler.
     * Since we are sampling potentially with SIGPROF or now SIGRTMIN and we
     * prefer to limit our events to 6, we do not need multiplexing. */
    if (getenv("CBTF_HWCSAMP_MULTIPLEX") != NULL) {
#if !defined(RUNTIME_PLATFORM_BGP)
        rval = PAPI_set_multiplex( tls->EventSet );
        if ( rval == PAPI_ENOSUPP) {
            fprintf(stderr,"CBTF_Create_Eventset: Multiplex not supported\n");
        } else if (rval != PAPI_OK) {
            CBTF_PAPIerror(rval,"CBTF_Create_Eventset set_multiplex");
        }
#endif
    }

    /* TODO: check return values of direct PAPI calls
     * and handle them as needed. */
    /* Rework the code here to call PAPI directly rather than
     * call any OPENSS helper functions due to inconsistent
     * behaviour seen on various lab systems. */
    int eventcode = 0;
    rval = PAPI_OK;
    if (hwcsamp_papi_event != NULL) {
        char *tfptr, *saveptr = NULL, *tf_token;
        tfptr = strdup(hwcsamp_papi_event);
        for (tf_token = strtok_r(tfptr, ",", &saveptr);
             tf_token != NULL;
             tf_token = strtok_r(NULL, ",", &saveptr) ) {
            PAPI_event_name_to_code(tf_token, &eventcode);
            rval = PAPI_add_event(tls->EventSet, eventcode);
            if (rval != PAPI_OK) {
                CBTF_PAPIerror(rval,"CBTF_Create_Eventset PAPI_add_event");
            }
        }
        if (tfptr) free(tfptr);
    } else {
        PAPI_event_name_to_code("PAPI_TOT_CYC", &eventcode);
        rval = PAPI_add_event(tls->EventSet, eventcode);
        PAPI_event_name_to_code("PAPI_TOT_INS", &eventcode);
        rval = PAPI_add_event(tls->EventSet, eventcode);
    }

#if defined (HAVE_OMPT)
    /* these are ompt specific. */
    /* initialize the flags and counts for idle, wait_barrier. */
    tls->thread_idle = tls->thread_wait_barrier = tls->thread_barrier = false;
#endif

    /* Begin sampling */
    tls->header.time_begin = CBTF_GetTime();
    CBTF_Start(tls->EventSet);
    CBTF_Timer(tls->data.interval, hwcsampTimerHandler);
}
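// A minimal sketch (not part of the collector) of the same event-set lifecycle using
// only documented PAPI calls, without the CBTF_* wrappers or the sampling timer:
// init, create, bind to the cpu component, add events by name, start, stop. The
// variable names and the two event names are illustrative.
#include <stdio.h>
#include <string.h>
#include <papi.h>

int main(void) {
    int evset = PAPI_NULL, code, rv;
    long long counts[2];
    char names[] = "PAPI_TOT_CYC,PAPI_TOT_INS", *save = NULL, *tok;

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) return 1;
    if (PAPI_create_eventset(&evset) != PAPI_OK) return 1;
#if (PAPI_VERSION_MAJOR(PAPI_VERSION) >= 4)
    /* Component PAPI: bind the event set to component 0 (the cpu) before use. */
    if (PAPI_assign_eventset_component(evset, 0) != PAPI_OK) return 1;
#endif
    for (tok = strtok_r(names, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
        if ((rv = PAPI_event_name_to_code(tok, &code)) == PAPI_OK)
            rv = PAPI_add_event(evset, code);
        if (rv != PAPI_OK)
            fprintf(stderr, "could not add %s: %s\n", tok, PAPI_strerror(rv));
    }
    if (PAPI_start(evset) != PAPI_OK) return 1;
    /* ... the code being measured would run here ... */
    if (PAPI_stop(evset, counts) != PAPI_OK) return 1;
    printf("cycles=%lld instructions=%lld\n", counts[0], counts[1]);
    return 0;
}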
int main(int argc, char **argv)
{
    int size, rank, left, right, you, ndata = 127, ndata_max = 127, seed;
    int rv;
    long long int i, j, k;
    unsigned long long int nflop = 0, nmem = 1, nsleep = 0, nrep = 1, myflops;
    char *env_ptr;
    double *sbuf, *rbuf, *x;
    MPI_Status *s;
    MPI_Request *r;
    time_t ts;

#ifdef HPM
    if ((rv = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
        fprintf(stderr, "Error: %d %s\n", rv, PAPI_strerror(rv));
        exit(1);
    }
    if ((num_hwcntrs = PAPI_num_counters()) < 1) {
        printf("There are no counters available.\n");
        exit(1);
    }
    if ((rv = PAPI_start_counters(events, 2)) != PAPI_OK) {
        fprintf(stdout, "ERROR PAPI_start_counters rv=%d\n", rv);
        exit(rv);
    }
#endif

    seed = time(&ts);
    flags |= DOMPI;
    while(--argc && argv++) {
        if(!strcmp("-v",*argv)) {
            flags |= DOVERBOSE;
        } else if(!strcmp("-n",*argv)) {
            --argc; argv++;
            nflop = atol(*argv);
        } else if(!strcmp("-N",*argv)) {
            --argc; argv++;
            nrep = atol(*argv);
        } else if(!strcmp("-d",*argv)) {
            --argc; argv++;
            ndata_max = ndata = atol(*argv);
        } else if(!strcmp("-m",*argv)) {
            --argc; argv++;
            nmem = atol(*argv);
        } else if(!strcmp("-s",*argv)) {
            --argc; argv++;
            nsleep = atol(*argv);
        } else if(!strcmp("-spray",*argv)) {
            flags |= DOSPRAY;
        } else if(!strcmp("-c",*argv)) {
            flags |= CORE;
        } else if(!strcmp("-r",*argv)) {
            flags |= REGION;
        } else if(!strcmp("-stair",*argv)) {
            flags |= STAIR_RANK;
        } else if(!strcmp("-stair_region",*argv)) {
            flags |= STAIR_REGION;
        } else if(!strcmp("-nompi",*argv)) {
            flags &= ~DOMPI;
        }
    }

    if(flags & DOMPI) {
        MPI_Init(&argc,&argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    }

    if(nmem) {
        nmem = (nmem*1024*1024/sizeof(double));
        x = (double *)malloc((size_t)(nmem*sizeof(double)));
        for(j=0;j<nrep;j++) {
            for(i=0;i<nmem;i++) {
                x[i] = i;
            }
            for(i=0;i<nmem;i++) {
                x[i] = i*x[i];
            }
            if(x[nmem-1]*x[nmem-1] < 0) {
                printf("trickster\n");
            }
        }
        if(0) free((char *)x);
    }

#ifdef IPM
    if(flags & REGION && rank > -1 )
        MPI_Pcontrol(1,"region_zzzzzzzzzzzZz");
#endif

    if(nflop) {
        x = (double *)malloc((size_t)(10*sizeof(double)));
        j = k = 0;
        for(i=0;i<10;i++) {
            x[i] = 1.0;
        }
        if(flags & STAIR_RANK) {
            myflops = (rank*nflop)/size;
        } else {
            myflops = nflop;
        }
        for(i=0;i<nflop;i++) {
            x[j] = x[j]*x[k];
            j = ((i%9)?(j+1):(0));
            k = ((i%8)?(k+1):(0));
        }
        free((char *)x);
    }

    if(nsleep) {
        sleep(nsleep);
    }

#ifdef IPM
    if(flags & REGION && rank > -1 )
        MPI_Pcontrol(-1,"region_zzzzzzzzzzzZz");
#endif

    if(nmem<nflop) nmem=nflop;
    if(nflop>1) printf("FLOPS = %llu BYTES = %llu\n", nflop, nmem);
    fflush(stdout);

    if(flags & CORE) {
        /* deliberately read far out of bounds to force a crash when -c is given */
        for(i=0;;i++) {
            x[i] = x[i*i-1000];
        }
    }

    env_ptr = getenv("IPM_SOCKET");
    if(env_ptr) {
        printf("IPM: %d IPM_SOCKET in app %s\n", rank, env_ptr);
    }

    if(flags & DOMPI) {
        s = (MPI_Status *)malloc((size_t)(sizeof(MPI_Status)*2*size));
        r = (MPI_Request *)malloc((size_t)(sizeof(MPI_Request)*2*size));
        sbuf = (double *)malloc((size_t)(ndata_max*sizeof(double)));
        rbuf = (double *)malloc((size_t)(ndata_max*sizeof(double)));
        for(i=0;i<ndata_max;i++) {
            sbuf[i] = rbuf[i] = i;
        }

        MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD);
        srand48(seed);
        for(i=0;i<nrep;i++) {
            MPI_Bcast(sbuf,ndata_max,MPI_DOUBLE,0,MPI_COMM_WORLD);
        }

        if(size>1) {
            if(!rank) { left = size-1; } else { left = rank-1; }
            if(rank == size-1) { right = 0; } else { right = rank+1; }
            you = (rank < size/2)?(rank+size/2):(rank-size/2);

            for(i=0;i<nrep;i++) {
                if(flags & DOSPRAY) {
                    ndata = (long int)(drand48()*ndata_max)+1;
                }
                MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,right,1,rbuf,ndata,MPI_DOUBLE,left,1,MPI_COMM_WORLD,s);
MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,left,1,rbuf,ndata,MPI_DOUBLE,right,1,MPI_COMM_WORLD,s); #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_a"); #endif MPI_Barrier(MPI_COMM_WORLD); MPI_Sendrecv(sbuf,ndata,MPI_DOUBLE,left,1,rbuf,ndata,MPI_DOUBLE,right,1,MPI_COMM_WORLD,s); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Isend(sbuf,ndata,MPI_DOUBLE,you,0,MPI_COMM_WORLD, r); MPI_Recv(rbuf,ndata,MPI_DOUBLE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD, s); MPI_Wait(r,s); MPI_Irecv(rbuf,ndata,MPI_DOUBLE,MPI_ANY_SOURCE,0,MPI_COMM_WORLD,r); MPI_Send(sbuf,ndata,MPI_DOUBLE,you,0,MPI_COMM_WORLD); MPI_Wait(r,s); for(j=0;j<size;j++) { MPI_Isend(sbuf+j%ndata_max,1,MPI_DOUBLE,j,4,MPI_COMM_WORLD, r+j); MPI_Irecv(rbuf+j%ndata_max,1,MPI_DOUBLE,j,4,MPI_COMM_WORLD,r+size+j); } MPI_Waitall(2*size,r,s); /* for(j=0;j<size;j++) { printf("rep %d stat %d %d %d\n",i, j, s[j].MPI_SOURCE, s[j+size].MPI_SOURCE); } */ #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_a"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_b"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_b"); #endif if(1) { #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_c"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_c"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_d"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_d"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_e"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_e"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_f"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_f"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_g"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_g"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_h"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_h"); #endif #ifdef IPM if(flags & REGION) MPI_Pcontrol(1,"region_i"); #endif MPI_Allreduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(sbuf,rbuf,ndata-1,MPI_DOUBLE, MPI_SUM, 1, MPI_COMM_WORLD); #ifdef IPM if(flags & REGION) MPI_Pcontrol(-1,"region_i"); #endif } } } MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); } #ifdef HPM if ((rv=PAPI_stop_counters(values, 2)) != PAPI_OK) { fprintf(stdout, "ERROR PAPI_stop_counters rv=%d\n", rv); exit(rv); } printf("PAPI: total instruction/cycles %lld/%lld %.3e \n", values[0], values[1], values[0]/(values[1]*1.0) ); #endif return 0; }
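// A standalone sketch of the HPM-style bracketing used above: the (older, pre-6.0)
// PAPI high-level API wraps a region with PAPI_start_counters()/PAPI_stop_counters().
// The two events and the dummy workload are illustrative only.
#include <stdio.h>
#include <papi.h>

int main(void) {
    int events[2] = { PAPI_TOT_INS, PAPI_TOT_CYC };
    long long values[2];
    int rv;
    volatile double x = 1.0;

    if ((rv = PAPI_start_counters(events, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI_start_counters: %s\n", PAPI_strerror(rv));
        return 1;
    }
    for (long i = 0; i < 10000000; i++)   /* region of interest */
        x = x * 1.0000001;
    if ((rv = PAPI_stop_counters(values, 2)) != PAPI_OK) {
        fprintf(stderr, "PAPI_stop_counters: %s\n", PAPI_strerror(rv));
        return 1;
    }
    printf("instructions/cycles = %lld/%lld = %.3e\n",
           values[0], values[1], values[0] / (values[1] * 1.0));
    return 0;
}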
int main( int argc, char** argv)
{
    int rank = -1, size, res, i, j, k;   /* rank initialized for the early-error path */
    char* ptr;
    char* data;
    struct timeval tv_start, tv_end;
    double theTime;
    double** times;
    char* filename = NULL;
    FILE* fd = NULL;
#if PAPI
    int* events = NULL;
    int num_hwcntrs = 0;
    long long int* values = NULL;
    int error = 0;
#endif
    int nb = NBXP;
    int buffsize = DEFAULTSIZE;
    unsigned int it;
    int myBuddyLeft, myBuddyRight, myBuddy;
    int getOnly, putOnly;

    getOnly = 0;
    putOnly = 0;

    if( argc >= 4 ) {
        if( 0 == strncmp( argv[3], "g", 1 ) ) {
            getOnly = 1;
        }
        if( 0 == strncmp( argv[3], "p", 1 ) ) {
            putOnly = 1;
        }
        if( 0 == strncmp( argv[3], "o", 1 ) ) {
            asprintf( &filename, "%s.%d", "putget", getpid() );
        }
    }
    if( argc >= 5 ) {
        if( 0 == strncmp( argv[4], "o", 1 ) ) {
            asprintf( &filename, "%s.%d", "putget", getpid() );
        }
    }
    if( argc >= 3 )
        buffsize = atoi( argv[2] );
    if( argc >= 2 )
        nb = atoi( argv[1] );

#if PAPI
    /* Initialize the PAPI library and get the number of counters available */
    if( (num_hwcntrs = PAPI_num_counters()) <= PAPI_OK ){
        printf( "Error while retrieving counters info\n" );
        return EXIT_FAILURE;
    }
    printf( "This system has %d available counters.\n", num_hwcntrs );
    if( num_hwcntrs < NBCOUNTERS ) {
        printf( "Not enough counters available (%d available, I need %d)\n", num_hwcntrs, NBCOUNTERS );
        error = 1;
        goto finish;
    }
    num_hwcntrs = NBCOUNTERS;

    values = (long long int*) malloc( num_hwcntrs * sizeof( long long int ) );
    events = (int*) malloc( num_hwcntrs * sizeof( int ) );
    events[0] = PAPI_TOT_CYC;
    events[1] = PAPI_TOT_INS;

    /* Start counting events */
    int rc;
    rc = PAPI_start_counters( events, num_hwcntrs );
    if ( rc != PAPI_OK ){
        printf( "error starting counting events\n" );
        switch( rc ){
        case PAPI_EINVAL:
            printf( "One or more of the arguments is invalid.\n" );
            break;
        case PAPI_EISRUN:
            printf( "Counters have already been started, you must call PAPI_stop_counters() before you call this function again.\n" );
            break;
        case PAPI_ESYS:
            printf( "A system or C library call failed inside PAPI, see the errno variable.\n" );
            break;
        case PAPI_ENOMEM:
            printf( "Insufficient memory to complete the operation.\n" );
            break;
        case PAPI_ECNFLCT:
            printf( "The underlying counter hardware cannot count this event and other events in the EventSet simultaneously.\n" );
            break;
        case PAPI_ENOEVNT:
            printf( "The PAPI preset is not available on the underlying hardware.\n" );
            break;
        }
        error = 1;
        goto finish;
    }
#endif

    start_pes( 0 );
    rank = shmem_my_pe();
    size = shmem_n_pes();

    if( size < 2 ) {
        fprintf( stderr, "We need at least 2 processes. Exiting.\n" );
Exiting.\n" ); return EXIT_SUCCESS; } if( rank == 0 || rank == 1 ) { if( NULL != filename ) { fd = fopen( filename, "w+" ); if( NULL == fd ) { fprintf( stderr, "pb ouverture fichier sortie\n" ); return EXIT_FAILURE; } } if( rank == 0 ) { printf( "buff size is %d, %d experiments\n", buffsize, nb ); } } //printf( "Hello, I am process %d in %d \n", rank, size ); // shmem_barrier_all(); sleep( 2 ); /* check */ myBuddyLeft = ( rank + 1 ) % size; myBuddyRight = ( rank + size - 1 ) % size; /* res = shmem_pe_accessible( myBuddyLeft ); if( 1 == res ) { printf( "[%d/%d] My buddy %d is reachable\n", rank, size, myBuddyLeft ); } else { printf( "[%d/%d] My buddy %d is NOT reachable\n", rank, size, myBuddyLeft ); } res = shmem_pe_accessible( myBuddyRight ); if( 1 == res ) { printf( "[%d/%d] My buddy %d is reachable\n", rank, size, myBuddyRight ); } else { printf( "[%d/%d] My buddy %d is NOT reachable\n", rank, size, myBuddyRight ); }*/ /* Allocate some memory */ ptr = (char*)shmalloc( buffsize ); data = (char*) malloc( buffsize ); /* Put stuff there */ memset( data, rank, buffsize ); /* Remote read in my buddy's memory */ if( rank % 2 ) { myBuddy = myBuddyRight; } else { myBuddy = myBuddyLeft; } shmem_barrier_all(); if( getOnly != 1 ) { if( rank == 0 ) { if( NULL == fd ) { printf( "\n\n - * - * - * - * - PUT - * - * - * - * -\n" ); printf( "[SIZE (B)]\t[BANDWIDTH (Gb/s)]\t[AVERAGE TIME (nsec)]\t[STD DEV]\n" ); } else { fprintf( fd, "\n\n - * - * - * - * - PUT - * - * - * - * -\n" ); fprintf( fd, "[SIZE (B)]\t[BANDWIDTH (Gb/s)]\t[AVERAGE TIME (nsec)]\t[STD DEV]\n" ); } k = 1; it = 0; while( k <= buffsize ) { experimentPut( k, nb, myBuddy, data, ptr, fd ); k = ( 0x01 << it ) + 3; if( k <= buffsize ) { experimentPut( k, nb, myBuddy, data, ptr, fd ); } k = ( 0x01 << it ) - 3; if( k > 0 ) { experimentPut( k, nb, myBuddy, data, ptr, fd ); } it++; k = 0x01 << it; } } shmem_barrier_all(); } if( putOnly != 1 ) { if( rank == 1 ) { if( NULL == fd ) { printf( "\n\n - * - * - * - * - GET - * - * - * - * -\n" ); printf( "[SIZE (B)]\t[BANDWIDTH (Gb/s)]\t[AVERAGE TIME (nsec)]\t[STD DEV]\n" ); } else { fprintf( fd, "\n\n - * - * - * - * - GET - * - * - * - * -\n" ); fprintf( fd, "[SIZE (B)]\t[BANDWIDTH (Gb/s)]\t[AVERAGE TIME (nsec)]\t[STD DEV]\n" ); } k = 1; it = 0; while( k <= buffsize ) { experimentGet( k, nb, myBuddy, data, ptr, fd ); k = ( 0x01 << it ) + 3; if( k <= buffsize ) { experimentGet( k, nb, myBuddy, data, ptr, fd ); } k = ( 0x01 << it ) - 3; if( k > 0 ) { experimentGet( k, nb, myBuddy, data, ptr , fd); } it++; k = 0x01 << it; } } shmem_barrier_all(); } finish: if( NULL != filename ) { free( filename ); } if( NULL != fd ){ fclose( fd ); } #if PAPI if( NULL != events ) { free( events ); } if( NULL != values ) { free( values ); } #endif if( rank == 0 ) { printf( "The end %d\n", EXIT_SUCCESS ); } // return EXIT_SUCCESS; exit( 0 ) ; }
int measure(int (*f)(double *A, unsigned int n), double *A, unsigned int n, int event)
{
    /* force the program to run on a single CPU */
    cpu_set_t my_set;        /* Define your cpu_set bit mask. */
    CPU_ZERO(&my_set);       /* Initialize it all to 0, i.e. no CPUs selected. */
    CPU_SET(0, &my_set);
    if (sched_setaffinity(0, sizeof(cpu_set_t), &my_set) != 0)
        perror("sched_setaffinity error");

    /* init lib */
    int events[4] = {
        PAPI_STL_ICY, // Cycles with no instruction issue
        PAPI_L2_DCM,  // L2 data cache misses
        PAPI_L2_DCH,  // L2 data cache hits
        PAPI_FP_OPS   // Floating point operations
    };
    char* event_names[4] = {
        "Cycles with no instruction issue: ",
        "L2 data cache misses: ",
        "L2 data cache hits: ",
        "Floating point operations: "
    };
    long long values[1] = {0};
    int eventSet = PAPI_NULL;
    int papi_err;
    bool papi_supported = true;

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
        fprintf(stderr, "PAPI is unsupported.\n");
        papi_supported = false;
    }
    if (PAPI_num_counters() < 4) {
        fprintf(stderr, "PAPI is unsupported.\n");
        papi_supported = false;
    }
    if ((papi_err = PAPI_create_eventset(&eventSet)) != PAPI_OK) {
        fprintf(stderr, "Could not create event set: %s\n", PAPI_strerror(papi_err));
    }
    if ((papi_err = PAPI_add_event(eventSet, events[event])) != PAPI_OK) {
        fprintf(stderr, "Could not add event %d: %s\n", event, PAPI_strerror(papi_err));
    }

    /* start counters */
    if (papi_supported) {
        if ((papi_err = PAPI_start(eventSet)) != PAPI_OK) {
            fprintf(stderr, "Could not start counters: %s\n", PAPI_strerror(papi_err));
        }
    }

    f(A, n);

    /* stop counters */
    if (papi_supported) {
        if ((papi_err = PAPI_stop(eventSet, values)) != PAPI_OK) {
            fprintf(stderr, "Could not get values: %s\n", PAPI_strerror(papi_err));
        }
        int save_to_file = 0;
        if (save_to_file > 0) {
            FILE *fp;
            char filename[60];
            sprintf(filename, "results/event%d.txt", event);
            fp = fopen(filename, "w+");
            if (fp == NULL) {
                perror("Error while saving results to file");
            } else {
                fprintf(fp, "%lld", values[0]);
                fclose(fp);
            }
        }
        printf("%s\t %lld\n", event_names[event], values[0]);
    }
    /* report whether a measurement was actually collected */
    return papi_supported ? 0 : -1;
}
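// An example driver for measure() above (illustrative, not from the original):
// the event set holds a single counter at a time, so the same kernel is run once
// per event index. transpose() is a hypothetical kernel that matches the
// int (*f)(double*, unsigned int) signature.
#include <stdlib.h>

extern int measure(int (*f)(double *A, unsigned int n), double *A, unsigned int n, int event);

static int transpose(double *A, unsigned int n) {
    for (unsigned int i = 0; i < n; i++)
        for (unsigned int j = i + 1; j < n; j++) {
            double t = A[i*n + j];
            A[i*n + j] = A[j*n + i];
            A[j*n + i] = t;
        }
    return 0;
}

int main(void) {
    unsigned int n = 512;
    double *A = calloc((size_t)n * n, sizeof *A);
    if (!A) return 1;
    for (int ev = 0; ev < 4; ev++)   /* one run per hardware event */
        measure(transpose, A, n, ev);
    free(A);
    return 0;
}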
static int papi_internal_init(pmdaInterface *dp) { int ec; int sts; PAPI_event_info_t info; char entry[PAPI_HUGE_STR_LEN+12]; // the length papi uses for the symbol name unsigned int i = 0; pmID pmid; sts = sprintf(papi_version, "%d.%d.%d", PAPI_VERSION_MAJOR(PAPI_VERSION), PAPI_VERSION_MINOR(PAPI_VERSION), PAPI_VERSION_REVISION(PAPI_VERSION)); if (sts < 0) { __pmNotifyErr(LOG_ERR, "%s failed to create papi version metric.\n",pmProgname); return PM_ERR_GENERIC; } if ((sts = __pmNewPMNS(&papi_tree)) < 0) { __pmNotifyErr(LOG_ERR, "%s failed to create dynamic papi pmns: %s\n", pmProgname, pmErrStr(sts)); papi_tree = NULL; return PM_ERR_GENERIC; } number_of_counters = PAPI_num_counters(); if (number_of_counters < 0) { __pmNotifyErr(LOG_ERR, "hardware does not support performance counters\n"); return PM_ERR_APPVERSION; } else if (number_of_counters == 0) { __pmNotifyErr(LOG_WARNING, "no performance counters\n"); } sts = PAPI_library_init(PAPI_VER_CURRENT); if (sts != PAPI_VER_CURRENT) { __pmNotifyErr(LOG_ERR, "PAPI_library_init error (%d)\n", sts); return PM_ERR_GENERIC; } ec = PAPI_PRESET_MASK; PAPI_enum_event(&ec, PAPI_ENUM_FIRST); do { if (PAPI_get_event_info(ec, &info) == PAPI_OK) { if (info.count && PAPI_PRESET_ENUM_AVAIL) { expand_papi_info(i); memcpy(&papi_info[i].info, &info, sizeof(PAPI_event_info_t)); memcpy(&papi_info[i].papi_string_code, info.symbol + 5, strlen(info.symbol)-5); snprintf(entry, sizeof(entry),"papi.system.%s", papi_info[i].papi_string_code); pmid = pmid_build(dp->domain, CLUSTER_PAPI, i); papi_info[i].pmid = pmid; __pmAddPMNSNode(papi_tree, pmid, entry); memset(&entry[0], 0, sizeof(entry)); papi_info[i].position = -1; papi_info[i].metric_enabled = 0; expand_values(i); i++; } } } while(PAPI_enum_event(&ec, 0) == PAPI_OK); #if defined(HAVE_PAPI_DISABLED_COMP) char *tokenized_string; int number_of_components; int component_id; int native; number_of_components = PAPI_num_components(); native = 0 | PAPI_NATIVE_MASK; for (component_id = 0; component_id < number_of_components; component_id++) { const PAPI_component_info_t *component; component = PAPI_get_component_info(component_id); if (component->disabled || (strcmp("perf_event", component->name) && strcmp("perf_event_uncore", component->name))) continue; sts = PAPI_enum_cmp_event (&native, PAPI_ENUM_FIRST, component_id); if (sts == PAPI_OK) do { if (PAPI_get_event_info(native, &info) == PAPI_OK) { char local_native_metric_name[PAPI_HUGE_STR_LEN] = ""; int was_tokenized = 0; expand_papi_info(i); memcpy(&papi_info[i].info, &info, sizeof(PAPI_event_info_t)); tokenized_string = strtok(info.symbol, "::: -"); while (tokenized_string != NULL) { size_t remaining = sizeof(local_native_metric_name) - strlen(local_native_metric_name) - 1; if (remaining < 1) break; strncat(local_native_metric_name, tokenized_string, remaining); was_tokenized = 1; tokenized_string=strtok(NULL, "::: -"); if (tokenized_string) { remaining = sizeof(local_native_metric_name) - strlen(local_native_metric_name) - 1; if (remaining < 1) break; strncat(local_native_metric_name, ".", remaining); } } if (!was_tokenized) { strncpy(papi_info[i].papi_string_code, info.symbol, sizeof(papi_info[i].papi_string_code) - 1); } else { strncpy(papi_info[i].papi_string_code, local_native_metric_name, sizeof(papi_info[i].papi_string_code) - 1); } snprintf(entry, sizeof(entry),"papi.system.%s", papi_info[i].papi_string_code); pmid = pmid_build(dp->domain, CLUSTER_PAPI, i); papi_info[i].pmid = pmid; __pmAddPMNSNode(papi_tree, pmid, entry); memset(&entry[0], 0, sizeof(entry)); 
papi_info[i].position = -1; papi_info[i].metric_enabled = 0; expand_values(i); i++; } } while (PAPI_enum_cmp_event(&native, PAPI_ENUM_EVENTS, component_id) == PAPI_OK); } #endif pmdaTreeRebuildHash(papi_tree, number_of_events); /* Set one-time settings for all future EventSets. */ if ((sts = PAPI_set_domain(PAPI_DOM_ALL)) != PAPI_OK) { handle_papi_error(sts, 0); return PM_ERR_GENERIC; } if ((sts = PAPI_multiplex_init()) != PAPI_OK) { handle_papi_error(sts, 0); return PM_ERR_GENERIC; } sts = refresh_metrics(0); if (sts != PAPI_OK) return PM_ERR_GENERIC; return 0; }
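// A freestanding sketch of the preset-enumeration loop used above: walk the preset
// event space with PAPI_enum_event() and keep only events with a nonzero counter
// mapping, printing them instead of building a PMNS tree. Assumes only documented
// PAPI calls; output formatting is arbitrary.
#include <stdio.h>
#include <papi.h>

int main(void) {
    PAPI_event_info_t info;
    int ec = PAPI_PRESET_MASK;

    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) return 1;
    PAPI_enum_event(&ec, PAPI_ENUM_FIRST);
    do {
        if (PAPI_get_event_info(ec, &info) == PAPI_OK && info.count > 0)
            printf("%-20s %s\n", info.symbol, info.short_descr);
    } while (PAPI_enum_event(&ec, PAPI_ENUM_EVENTS) == PAPI_OK);
    return 0;
}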