/*
 * Compression benchmark driver.
 *
 * Usage: prog [-s] file...
 *   -s   stop after the snappy test for each file.  When COMP is defined the
 *        LZ4 test still runs, because it precedes the snappy_only check.
 *
 * Each file named on the command line is mapped with mapfile(), every page is
 * touched once, and the test_* routines are run on the mapping.
 * Returns 0 once all files have been processed.
 */
int main(int ac, char **av)
{
	int snappy_only = 0;

	if (av[1] && !strcmp(av[1], "-s")) {
		snappy_only = 1;
		av++;
	}

#ifdef SIMPLE_PMU
	pin_cpu(NULL);
	if (perfmon_available() == 0) {
		printf("no perfmon support\n");
		exit(1);
	}
#endif

	while (*++av) {
		size_t size;
		char *map = mapfile(*av, O_RDONLY, &size);

		if (!map)
			err(*av);

		/*
		 * Read one byte per 4096-byte page before any test runs.  The
		 * offset is a size_t: an int offset overflows (undefined
		 * behavior) once the mapping is larger than INT_MAX bytes.
		 */
		for (size_t off = 0; off < size; off += 4096) {
			char touched = ((volatile char *)map)[off];

			(void)touched;
		}

#ifdef COMP
		test_lz4(map, size, *av);
#endif
		test_snappy(map, size, *av);
		if (snappy_only)
			goto unmap;

#ifdef COMP
		test_lzo(map, size, *av);
		test_zlib(map, size, *av, 1);
		test_zlib(map, size, *av, 3);
		/* zlib level 5 is intentionally not run: test_zlib(map, size, *av, 5); */
		test_lzf(map, size, *av);
		test_quicklz(map, size, *av);
		test_fastlz(map, size, *av);
#endif

#ifdef SNAPREF
		test_snapref(map, size, *av);
#endif

unmap:
		unmap_file(map, size);
	}
	return 0;
}
void * driver1(void* arg) { int i,j,k, iter_count = 0; uint64_t line_count=0; size_t * write_pntr; write_pntr = array; // pin core affinity if(pin_cpu(pid, cpu_write) == -1) { err(1,"cannot set cpu write affinity"); } else{ printf(" write thread pinned to core %d to run\n",cpu_write); } fprintf(stderr,"from writer, total_lines = %ld, shared = %d\n", total_lines,shared); while(line_count < total_lines) { i = 0; while(exchange_flag == 1) { i++; } if(shared == 0) { // if(iter_count < 10)fprintf(stderr,"writer calling write kernel\n"); write_pntr = write_buf(seg_size, write_pntr); // if(iter_count < 10)fprintf(stderr,"writer returned from write kernel\n"); } else { // if(iter_count < 10)fprintf(stderr,"writer calling read kernel\n"); write_pntr = read_buf(seg_size, write_pntr); // if(iter_count < 10)fprintf(stderr,"writer returned from read kernel\n"); } line_count += seg_size; iter_count++; exchange_flag = 1; } pthread_exit(NULL); }
void * driver0(void * arg) { int i,j,k, iter_count =0; uint64_t line_count=0, init_tsc, end_tsc; size_t * read_pntr; read_pntr = array; // pin core affinity if(pin_cpu(pid, cpu_read) == -1) { err(1,"cannot set cpu read affinity"); } else{ printf(" read thread pinned to core %d to run\n",cpu_read); } fprintf(stderr,"total_lines = %ld\n",total_lines); read_sum_tsc = 0; while(line_count < total_lines) { i = 0; while(exchange_flag == 0) { i++; } init_tsc = _rdtsc(); // if(iter_count < 10)fprintf(stderr,"reader calling kernel\n"); read_pntr = read_buf(seg_size, read_pntr); // if(iter_count < 10)fprintf(stderr,"reader returned from kernel\n"); end_tsc = _rdtsc(); read_sum_tsc += (end_tsc - init_tsc); line_count += seg_size; iter_count++; exchange_flag = 0; } fprintf(stderr," from read thread, line_count = %ld, TSC sum = %lu, latency = %g\n", line_count, read_sum_tsc,(double)read_sum_tsc/(double)line_count); pthread_exit(NULL); }
static void measure(void) { perf_event_desc_t *fds = NULL; int num_fds = 0; uint64_t values[3]; ssize_t n; int i, ret; int pr[2], pw[2]; pid_t pid; char cc = '0'; ret = pfm_initialize(); if (ret != PFM_SUCCESS) err(1, "cannot initialize libpfm"); if (options.cpu == -1) { srandom(getpid()); options.cpu = random() % sysconf(_SC_NPROCESSORS_ONLN); } ret = pipe(pr); if (ret) err(1, "cannot create read pipe"); ret = pipe(pw); if (ret) err(1, "cannot create write pipe"); ret = perf_setup_list_events(options.events, &fds, &num_fds); if (ret || !num_fds) exit(1); for(i=0; i < num_fds; i++) { fds[i].hw.disabled = 1; fds[i].hw.read_format = PERF_FORMAT_SCALE; fds[i].fd = perf_event_open(&fds[i].hw, 0, -1, -1, 0); if (fds[i].fd == -1) err(1, "cannot open event %d", i); } /* * Pin to CPU0, inherited by child process. That will enforce * the ping-pionging and thus stress the PMU context switch * which is what we want */ ret = pin_cpu(getpid(), options.cpu); if (ret) err(1, "cannot pin to CPU%d", options.cpu); printf("Both processes pinned to CPU%d, running for %d seconds\n", options.cpu, options.delay); /* * create second process which is not monitoring at the moment */ switch(pid=fork()) { case -1: err(1, "cannot create child\n"); case 0: /* do not inherit session fd */ for(i=0; i < num_fds; i++) close(fds[i].fd); /* pr[]: write master, read child */ /* pw[]: read master, write child */ close(pr[1]); close(pw[0]); do_child(pr[0], pw[1]); exit(1); } close(pr[0]); close(pw[1]); /* * Let's roll now */ prctl(PR_TASK_PERF_EVENTS_ENABLE); signal(SIGALRM, sig_handler); alarm(options.delay); /* * ping pong loop */ while(!quit) { n = write(pr[1], "c", 1); if (n < 1) err(1, "write failed"); n = read(pw[0], &cc, 1); if (n < 1) err(1, "read failed"); } prctl(PR_TASK_PERF_EVENTS_DISABLE); for(i=0; i < num_fds; i++) { uint64_t val; double ratio; ret = read(fds[i].fd, values, sizeof(values)); if (ret == -1) err(1,"pfm_read error"); if (ret != sizeof(values)) errx(1, "did not read correct 
amount %d", ret); val = perf_scale(values); ratio = perf_scale_ratio(values); if (ratio == 1.0) printf("%20"PRIu64" %s\n", val, fds[i].name); else if (ratio == 0.0) printf("%20"PRIu64" %s (did not run: competing session)\n", val, fds[i].name); else printf("%20"PRIu64" %s (scaled from %.2f%% of time)\n", val, fds[i].name, ratio*100.0); } /* * kill child process */ kill(SIGKILL, pid); /* * close pipes */ close(pr[1]); close(pw[0]); /* * and destroy our session */ for(i=0; i < num_fds; i++) close(fds[i].fd); perf_free_fds(fds, num_fds); /* free libpfm resources cleanly */ pfm_terminate(); }
static void pfms_thread_mainloop(void *arg) { long k = (long )arg; uint32_t mycpu = (uint32_t)k; pfarg_ctx_t myctx, *ctx; pfarg_load_t load_args; int fd = -1; pfms_thread_t *td; sem_t *cmd_sem; int ret = 0; memset(&load_args, 0, sizeof(load_args)); load_args.load_pid = mycpu; td = tds+mycpu; ret = pin_cpu(mycpu); dprint("CPU%u wthread created and pinned ret=%d\n", mycpu, ret); cmd_sem = &tds[mycpu].cmd_sem; for(;;) { dprint("CPU%u waiting for cmd\n", mycpu); sem_wait(cmd_sem); switch(td->cmd) { case CMD_NONE: ret = 0; break; case CMD_CTX: /* * copy context to get private fd */ ctx = td->data; myctx = *ctx; fd = pfm_create_context(&myctx, NULL, NULL, 0); ret = fd < 0 ? -1 : 0; dprint("CPU%u CMD_CTX ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_LOAD: ret = pfm_load_context(fd, &load_args); dprint("CPU%u CMD_LOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_UNLOAD: ret = pfm_unload_context(fd); dprint("CPU%u CMD_UNLOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_START: ret = pfm_start(fd, NULL); dprint("CPU%u CMD_START ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_STOP: ret = pfm_stop(fd); dprint("CPU%u CMD_STOP ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_WPMCS: ret = pfm_write_pmcs(fd,(pfarg_pmc_t *)td->data, td->ndata); dprint("CPU%u CMD_WPMCS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_WPMDS: ret = pfm_write_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata); dprint("CPU%u CMD_WPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_RPMDS: ret = pfm_read_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata); dprint("CPU%u CMD_RPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd); break; case CMD_CLOSE: dprint("CPU%u CMD_CLOSE fd=%d\n", mycpu, fd); ret = close(fd); fd = -1; break; default: break; } td->ret = ret; dprint("CPU%u td->ret=%d\n", mycpu, ret); barrier_wait(td->barrier); } }
int main(int argc, char ** argv) { char * buf1; void * ret; size_t * array, ret_val = 0; size_t array_stride; int i,j,k,cpu,cpu_run,line_count,stride, fd = -1; off_t offset = 0; int len=10240000, iter=100,mult=1,main_ret=0; double iterations; double *a, *b; size_t start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; size_t buf_size,jj,zero_loop, buf_by_num_seg,ind; size_t num_pages, page_size, var_size; int cpu_setsize; cpu_set_t mask; // size_t pattern[] = {4,1,5,2,6,3,7,0}; int *pattern; int step, c; int* index, lc_by_num_seg,count, num_seg=32, huge=0; unsigned int bitmask, *intstar; page_size = 4096; // process input arguments if(argc < 6){ fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc); usage(); err(1,"insufficient invocation arguments"); } while ((c = getopt(argc, argv, "i:r:l:s:S:m:L")) != -1) { switch(c) { case 'i': cpu = atoi(optarg); break; case 'r': cpu_run = atoi(optarg); break; case 'l': line_count = atoi(optarg); break; case 's': stride = atoi(optarg); break; case 'S': num_seg = atoi(optarg); break; case 'm': mult = atoi(optarg); break; case 'L': huge=1; page_size = 2 * 1024 * 1024; break; default: err(1, "unknown option %c", c); } } iter = iter*mult; var_size = sizeof(size_t); fprintf(stderr, "size_t in %zd bytes\n",var_size); // pin core affinity if(pin_cpu(pid, cpu) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d\n",cpu); } pattern = (int*) malloc(num_seg*sizeof(int)); if(pattern == NULL) { fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg); err(1,"malloc of pattern failed"); } // calculate stride and buffer size stride = page_size*stride + 64; buf_size = (size_t)line_count*(size_t)stride; num_pages = buf_size/page_size + 2; buf_size = page_size*num_pages; array_stride = stride/sizeof(double); iterations = (double)iter*(double)len; // create index array for "random" 
patterna index = (int*)malloc(line_count*sizeof(int)); if(index == NULL) { fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count); err(1,"failed to malloc index"); } if(num_seg == 1) { for(i=0; i<line_count-1; i++)index[i] = i; } else { // fprintf(stderr," calling rndm_list, n = %d\n",num_seg); rndm_list(pattern,num_seg); lc_by_num_seg = line_count/num_seg; if(lc_by_num_seg*num_seg != line_count) { fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg); err(1," bad line_count"); } count=0; buf_by_num_seg = buf_size/num_seg; for(i=0; i<lc_by_num_seg; i++) { step = 0; for(j=0;j<num_seg;j++) { count++; if(j == (num_seg-1) ) step = 1; ind = lc_by_num_seg*pattern[j]; index[count]= (int) ind + i + step; if(index[count] >= line_count) printf(" count = %d, index = %d\n",count,index[count]); } } } index[0] = 0; for(i=0; i<line_count; i++)index[i] = index[i]*array_stride; // malloc and initialize buffers // replace malloc call with a call to mmap if(huge == 0) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset); if(huge == 1) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset); if(buf1 == MAP_FAILED) { fprintf(stderr,"mmap failed\n"); err(1,"mmap failed"); } fprintf(stderr," buf1 for a = %p\n",buf1); a = (double*) buf1; if(huge == 0) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset); if(huge == 1) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset); if(buf1 == MAP_FAILED) { fprintf(stderr,"mmap failed\n"); err(1,"mmap failed"); } fprintf(stderr," buf1 for b = %p\n",buf1); b = (double*)buf1; zero_loop = buf_size/sizeof(double); fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zd\n",buf_size,zero_loop,array_stride); for(i=0; i<zero_loop; i++) a[i] = 0; for(i=0; i<zero_loop; i++) b[i] = 0; 
fprintf(stderr," finished zeroing buf for a, b\n"); // pin core affinity if(pin_cpu(pid, cpu_run) == -1) { err(1,"cannot set cpu run affinity"); } else{ printf(" process pinned to core %d to run\n",cpu_run); } // run the walker printf(" calling walker %d times which loops %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); ret_val = reader(len,line_count,a,b,index); // fprintf(stderr, " retval = %ld\n",ret_val); stop = _rdtsc(); run_time = stop - start; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; printf(" run time = %zd\n",call_run_time); // printout printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations); return main_ret; }
int main(int argc, char ** argv) { char * buf1; void * ret; size_t ret_val = 0; size_t array_stride; int rc0, rc1; int i,j,k, line_count=0,stride=0, fd = -1; off_t offset = 0; int len=10240000, iter=10,mult=1,main_ret=0; double iterations; size_t start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; size_t buf_size,jj,zero_loop, buf_by_num_seg,ind; size_t num_pages, page_size, var_size; int cpu_setsize; cpu_set_t mask; // size_t pattern[] = {4,1,5,2,6,3,7,0}; int *pattern; int step, c; int* index, lc_by_num_seg,count, num_seg=32, huge=0; unsigned int bitmask, *intstar; void *arg; pthread_t * Thread_dat; page_size = 4096; shared = 0; // process input arguments if(argc < 6){ fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc); usage(); err(1,"insufficient invocation arguments"); } while ((c = getopt(argc, argv, "i:r:w:l:S:m:L:sh")) != -1) { switch(c) { case 'i': cpu = atoi(optarg); break; case 'r': cpu_read = atoi(optarg); break; case 'w': cpu_write = atoi(optarg); break; case 'l': seg_size = atoi(optarg); break; case 's': shared = 1; break; case 'S': num_seg = atoi(optarg); break; case 'm': mult = atoi(optarg); break; case 'L': line_count = atoi(optarg); break; case 'h': usage(); exit(1); default: err(1, "unknown option %c", c); } } iter = iter*mult; total_lines = len*iter; fprintf(stderr," seg_size = %d, line_count = %d, total_lines = %ld\n", seg_size, line_count, total_lines); var_size = sizeof(size_t); fprintf(stderr, "size_t in %zd bytes\n",var_size); Thread_dat = (pthread_t *) malloc(MAX_THREAD*sizeof(pthread_t)); if(Thread_dat == NULL) err(1, "malloc of Thread failed"); // pin core affinity if(pin_cpu(pid, cpu) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d\n",cpu); } pattern = (int*) malloc(num_seg*sizeof(int)); if(pattern == NULL) { fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg); 
err(1,"malloc of pattern failed"); } // calculate stride and buffer size stride = page_size*stride + 64; buf_size = (size_t)line_count*(size_t)stride; num_pages = buf_size/page_size + 2; buf_size = page_size*num_pages; array_stride = stride/sizeof(size_t *); iterations = (double)iter*(double)len; // create index array for "random" patterna index = (int*)malloc(line_count*sizeof(int)); if(index == NULL) { fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count); err(1,"failed to malloc index"); } if(num_seg == 1) { for(i=0; i<line_count; i++)index[i] = i; } else { // fprintf(stderr," calling rndm_list, n = %d\n",num_seg); rndm_list(pattern,num_seg); lc_by_num_seg = line_count/num_seg; if(lc_by_num_seg*num_seg != line_count) { fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg); err(1," bad line_count"); } count=0; buf_by_num_seg = buf_size/num_seg; for(i=0; i<lc_by_num_seg; i++) { step = i*num_seg; for(j=0;j<num_seg;j++) { count++; ind = lc_by_num_seg*pattern[j]; index[count]= (int) pattern[j] + step; if(index[count] >= line_count) printf(" count = %d, index = %d\n",count,index[count]); } count++; index[count]=(i+1)*num_seg; } index[count] = 0; } index[0] = 0; // test index map for every value between 0 and line_count-1 showing up once for(i=0;i<line_count;i++) { if(index[i] > line_count)err(1,"index[%d] = %d",i,index[i]); // if(i < 128 )fprintf(stderr,"index[%d] = %d\n",i,index[i]); // if(i > line_count - 128 )fprintf(stderr,"index[%d] = %d\n",i,index[i]); index_test[index[i]]++; } bad = 0; for(i=0;i<line_count;i++) { if(index_test[i] != 1) { bad++; if(bad < 64)fprintf(stderr,"index_test[%d] = %d\n",i,index_test[i]); } } fprintf(stderr,"bad = %d\n",bad); // malloc and initialize buffers buf1 = (char *)malloc(buf_size + 4096 ); // replace malloc call with a call to mmap /* if(huge == 0) buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset); if(huge == 1) 
buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset); if(buf1 == MAP_FAILED) { fprintf(stderr,"mmap failed\n"); err(1,"mmap failed"); } */ fprintf(stderr," buf1 = %p\n",buf1); // buf1 = buf1 + (0x1000 - (size_t)buf1 & 0xFFF) ; // fprintf(stderr," buf1 = %p\n",buf1); zero_loop = buf_size/(size_t)var_size; fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zd\n",buf_size,zero_loop,array_stride); // for(i=0;i<buf_size;i++)buf1[i]=0; //touch every page to ensure creation array = (size_t *) buf1; // for(i=0; i<zero_loop; i++) array[i] = 0; ret = memset(buf1, 0, (size_t)buf_size); fprintf(stderr," finished zeroing buf ret = %p\n",ret); // for(jj=0;jj<line_count-1; jj++)array[jj*(size_t)array_stride] = (size_t) &array[(size_t)array_stride*(jj+1)]; for(jj=0;jj<line_count-1;jj++)array[index[jj]*array_stride] = (size_t)&array[index[jj+1]*array_stride]; fprintf(stderr," target of last element in loop = %zx\n",(size_t)(array[line_count-1]-(size_t)buf1)); array[(size_t)array_stride*index[line_count-1]] = (size_t)&array[0]; // for(jj=0; jj< line_count; jj+=8) printf(", jj = %d, array[jj]-&array[0]/array_stride = %d\n",jj,(array[jj]-(size_t)&array[0])/array_stride); // run the walker printf(" invoking reader %d times which loops %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size); exchange_flag = 0; rc0 = pthread_create(&Thread_dat[0], NULL, driver0, (void *)arg); if(rc0) err(1,"failed to start thread for driver0"); printf(" invoking writer %d times which loops %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size); rc1 = pthread_create(&Thread_dat[1], NULL, driver1, (void *)arg); if(rc1) err(1,"failed to start thread for driver1"); printf(" run time = %zd\n",call_run_time); // printout printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations); 
pthread_exit(NULL); return main_ret; }
void main(int argc, char ** argv) { double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0; char * buf1, *buf2, *buf3; int i,j,k,offset_a=0,offset_b=0,offset_c=0, mult=1,iter=100, c_val; int len,mem_level, level_size[4], cpu, cpu_run, bytes_per,scale; unsigned long long start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; int cpu_setsize; cpu_set_t mask; // process input arguments if(argc < 3 ){ printf("triad driver needs at least 3 arguments, cpu_init, cpu_run, cache_level, [call count multiplier def = 1], [offset a, offset_b, offset_c defaults = 0] \n"); printf(" argc = %d\n",argc); usage(); err(1, "bad arguments"); } len = L4; while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) { switch(c_val) { case 'i': cpu = atoi(optarg); break; case 'r': cpu_run = atoi(optarg); break; case 'l': mem_level = atoi(optarg); break; case 'm': mult = atoi(optarg); break; case 'a': offset_a = atoi(optarg); break; case 'b': offset_b = atoi(optarg); break; case 'c': offset_c = atoi(optarg); break; default: err(1, "unknown option %c", c_val); } } iter = iter*mult; // pin core affinity for initialization if(pin_cpu(pid, cpu) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for initialization\n",cpu); } // set buffer sizes and loop tripcounts based on memory level level_size[0]=L1; level_size[1]=L2; level_size[2]=L3; level_size[3]=L4; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); len = level_size[mem_level]/32; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); scale = level_size[3]/(32*len); fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d, scale = %d\n",len, mem_level, iter,mult,scale); iter =iter*scale*mult; fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult); // malloc and initialize buffers buf1 = malloc(sizeof(double)*len + 4096 + 1024); 
fprintf(stderr," buf1 = %p\n",buf1); buf1 = buf1 + (0x1000 - (unsigned int)buf1 & 0xFFF) + offset_a; fprintf(stderr," buf1 = %p\n",buf1); a = (double *) buf1; buf2 = malloc(sizeof(double)*len + 4096 + 1024); fprintf(stderr," buf2 = %p\n",buf2); buf2 = buf2 + (0x1000 - (unsigned int)buf2 & 0xFFF) + offset_b; fprintf(stderr," buf2 = %p\n",buf2); b = (double *) buf2; buf3 = malloc(sizeof(double)*len + 4096 + 1024); fprintf(stderr," buf3 = %p\n",buf3); buf3 = buf3 + (0x1000 - (unsigned int)buf3 & 0xFFF) + offset_c; fprintf(stderr," buf3 = %p\n",buf3); c = (double *) buf3; for(i=0;i<len;i++){ a[i] = 0.; b[i] = 10.; c[i] = 10.; } // pin core affinity for triad run if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } // run the triad printf(" calling triad %d times with len = %d\n",iter,len); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); bytes_per = triad(len,xx,a,b,c); stop = _rdtsc(); run_time = stop - start; xx+=0.01; total_bytes +=len*bytes_per; bw=(double)(len*bytes_per)/(double)run_time; if(bw > best_bw) best_bw = bw; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; avg_bw=(double)(total_bytes)/(double)call_run_time; // printout printf(" transfering %lld bytes from memory level %d took %lld cycles/call and a total of %lld\n",total_bytes,mem_level,run_time,call_run_time); printf(" average bytes/cycle = %f\n", avg_bw); printf(" best bytes/cycle = %f\n",best_bw); }
void PAPI_HW_COUNTER_open(int tid){ // set events to measure int *Events; int EventCode; int event_ctr = 0; int retval; #ifdef MEASURE_TIME #endif #ifdef MEASURE_CPI thr_vars[tid].papi_idx_inst = thr_vars[tid].num_events++; thr_vars[tid].papi_idx_cyc = thr_vars[tid].num_events++; #endif #ifdef MEASURE_MEMACC thr_vars[tid].papi_idx_load = thr_vars[tid].num_events++; thr_vars[tid].papi_idx_store = thr_vars[tid].num_events++; #endif #ifdef MEASURE_LLCMISS thr_vars[tid].papi_idx_llcmiss = thr_vars[tid].num_events++; #endif #ifdef MEASURE_ICACHEMISS thr_vars[tid].papi_idx_icachemiss = thr_vars[tid].num_events++; #endif #ifdef MEASURE_DCACHEMISS thr_vars[tid].papi_idx_l1dcm = thr_vars[tid].num_events++; thr_vars[tid].papi_idx_l1dca = thr_vars[tid].num_events++; #endif #ifdef MEASURE_ENERGY #endif event_ctr = 0; // reset event counter if((Events=(int*)malloc(sizeof(int)*thr_vars[tid].num_events)) == NULL){ printf("ERROR: Failed to allocate memory for Events."); } if((thr_vars[tid].values=(long long int*)malloc(sizeof(long long)*thr_vars[tid].num_events)) == NULL){ printf("ERROR: Failed to allocate memory for Events."); } #ifdef __ARM_ARCH_7A__ // pin processor only on arm arch. 
pid_t pid = getpid(); int core = 0; printf("Pinning thread %d to cores %d..%d\n", pid, 0, 0); printf("Observe in terminal via \"ps -p <PID> -L -o pid,tid,psr\"\n"); pin_cpu(pid, core); printf("Pinned to core %d\n", core); #endif // Open file to output char filename_id[2*sizeof(int)]; snprintf(filename_id, sizeof(filename_id),"%d",tid); char* filename_w_id; filename_w_id=(char*)malloc(strlen(OUTFILEID)+strlen(OUTFILEEXT)+strlen(filename_id)+1); strcpy(filename_w_id, OUTFILEID); strcat(filename_w_id, filename_id); strcat(filename_w_id, OUTFILEEXT); thr_vars[tid].f=fopen(filename_w_id, "w"); if (thr_vars[tid].f == NULL){ printf("failed to open file %s.\n", filename_w_id); exit(1); } // Measure clock frequency long long elapsed_cyc; elapsed_cyc = PAPI_get_real_cyc(); sleep(1); elapsed_cyc = PAPI_get_real_cyc()-elapsed_cyc; thr_vars[tid].PAPI_CLOCK_RATE = elapsed_cyc; printf("Measured clock frequency: %.0lld Hz\n",thr_vars[tid].PAPI_CLOCK_RATE); // Set EventSet thr_vars[tid].EventSet = PAPI_NULL;/*EventSet*/ retval=PAPI_create_eventset(&(thr_vars[tid].EventSet)); if (retval != PAPI_OK){ papi_fail(__FILE__, __LINE__, "PAPI_create_eventset()", retval); } #ifdef MEASURE_TIME #endif #ifdef MEASURE_CPI retval = PAPI_event_name_to_code( PAPI_INST , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, inst", retval); } Events[event_ctr++] = EventCode; retval = PAPI_event_name_to_code( PAPI_CYC , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, cyc", retval); } Events[event_ctr++] = EventCode; #endif #ifdef MEASURE_MEMACC retval = PAPI_event_name_to_code( PAPI_MEM_LOAD , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, loads", retval); } Events[event_ctr++] = EventCode; retval=PAPI_event_name_to_code( PAPI_MEM_STORE , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, stores", retval); } 
Events[event_ctr++] = EventCode; #endif #ifdef MEASURE_LLCMISS retval = PAPI_event_name_to_code( PAPI_LLC_MISS , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, llc miss", retval); } Events[event_ctr++] = EventCode; #endif #ifdef MEASURE_ICACHEMISS retval = PAPI_event_name_to_code( PAPI_IC_MISS , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, llc miss", retval); } Events[event_ctr++] = EventCode; #endif #ifdef MEASURE_DCACHEMISS retval = PAPI_event_name_to_code( PAPI_L1_DC_MISS , &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, Level 1 data cache misses", retval); } Events[event_ctr++] = EventCode; retval = PAPI_event_name_to_code( PAPI_L1_DC_ACCESS, &EventCode ); if (retval != PAPI_OK ) { papi_fail(__FILE__, __LINE__, "PAPI_event_name_to_code, Level 1 data cache accesses", retval); } Events[event_ctr++] = EventCode; #endif #ifdef MEASURE_ENERGY printf("Probing all RAPL events\n"); thr_vars[tid].numcmp = PAPI_num_components(); for(thr_vars[tid].cid=0; thr_vars[tid].cid<thr_vars[tid].numcmp; thr_vars[tid].cid++) { if ( (thr_vars[tid].cmpinfo = PAPI_get_component_info(thr_vars[tid].cid)) == NULL) { papi_fail(__FILE__, __LINE__,"PAPI_get_component_info failed\n", 0); } if (strstr(thr_vars[tid].cmpinfo->name,"rapl")) { thr_vars[tid].rapl_cid=thr_vars[tid].cid; printf("Found rapl component at cid %d.\n",thr_vars[tid].rapl_cid); if (thr_vars[tid].cmpinfo->disabled) { printf("RAPL component disabled: %s\n", thr_vars[tid].cmpinfo->disabled_reason); exit(EXIT_FAILURE); } break; } } if (thr_vars[tid].cid==thr_vars[tid].numcmp) { // Component not found: papi_fail(__FILE__,__LINE__,"No rapl component found\n",0); } retval = PAPI_create_eventset( &(thr_vars[tid].EnergyEventSet) ); if (retval != PAPI_OK){ papi_fail(__FILE__,__LINE__, "PAPI_create_eventset()", retval); } // Add all events: int r; thr_vars[tid].code = PAPI_NATIVE_MASK; r = 
PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_FIRST, thr_vars[tid].rapl_cid ); while ( r == PAPI_OK ) { retval = PAPI_event_code_to_name( thr_vars[tid].code, thr_vars[tid].event_names[thr_vars[tid].num_energy_events] ); if ( retval != PAPI_OK ) { printf("Error translating %#x\n",thr_vars[tid].code); papi_fail(__FILE__, __LINE__, "PAPI_event_code_to_name", retval ); } printf("Found event: %s\n", thr_vars[tid].event_names[thr_vars[tid].num_energy_events]); retval = PAPI_get_event_info(thr_vars[tid].code,&(thr_vars[tid].evinfo)); if (retval != PAPI_OK) { papi_fail(__FILE__, __LINE__, "Error getting event info\n",retval); } strncpy(thr_vars[tid].units[thr_vars[tid].num_energy_events],thr_vars[tid].evinfo.units,PAPI_MIN_STR_LEN); thr_vars[tid].data_type[thr_vars[tid].num_energy_events] = thr_vars[tid].evinfo.data_type; retval = PAPI_add_event(thr_vars[tid].EnergyEventSet, thr_vars[tid].code); if (retval != PAPI_OK ) { papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval); } r = PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_EVENTS, thr_vars[tid].rapl_cid ); thr_vars[tid].num_energy_events++; } if((thr_vars[tid].energy_values=(long long int*)malloc(sizeof(long long)*thr_vars[tid].num_energy_events)) == NULL){ printf("ERROR: Failed to allocate memory for Events."); } #endif #ifdef MEASURE_HW_COUNTER int k; for(k = 0; k < thr_vars[tid].num_events; k++){ retval = PAPI_add_event(thr_vars[tid].EventSet, Events[k]); if (retval != PAPI_OK ) { printf("At event %d:\n",k); papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval); } } retval=PAPI_start(thr_vars[tid].EventSet); if (retval != PAPI_OK){ papi_fail(__FILE__, __LINE__, "PAPI_start()", retval); } #endif #ifdef MEASURE_ENERGY retval=PAPI_start(thr_vars[tid].EnergyEventSet); if (retval != PAPI_OK){ papi_fail(__FILE__, __LINE__, "PAPI_start() on energy", retval); } #endif }
void main(int argc, char ** argv) { double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0; char * buf1, *buf2, *buf3; int i,j,k,offset_a=0,offset_b=0,offset_c=0, mult=1,iter=1000, c_val; int len,num_pages, num_lines, cpu_run,scale; u64 start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0; __pid_t pid=0; int cpu_setsize; cpu_set_t mask; int *buff; size_t buf_size; off_t offset = 0; int fd = -1; // process input arguments if(argc < 3 ){ printf("affinity needs 2 arguments, cpu_run, call count multiplier def = 1\n"); printf(" argc = %d\n",argc); usage(); err(1, "bad arguments"); } while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) { switch(c_val) { case 'r': cpu_run = atoi(optarg); break; case 'm': mult = atoi(optarg); break; default: err(1, "unknown option %c", c_val); } } // pin core affinity for initialization if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } // set buffer sizes and loop tripcount buf_size = (u64)4096*(u64)num_pages; num_lines=64*num_pages; iter = iter*mult; // malloc and initialize buffers printf(" starting malloc loop of %d iterations with buf_size = %ld, num_lines = %d\n",iter,buf_size, num_lines); call_start = _rdtsc(); for(i=0;i<iter;i++){ start = _rdtsc(); if(pin_cpu(pid, cpu_run) == -1) { err(1,"failed to set affinity"); } else{ fprintf(stderr," process pinned to core %d for triad run\n",cpu_run); } stop = _rdtsc(); run_time = stop - start; } call_stop = _rdtsc(); call_run_time = call_stop - call_start; // printout printf(" allocating %lld bytes and initializing and freeing took %lld cycles\n",(u64)len*(u64)iter,run_time); }
int main(int argc, char **argv) { pfmlib_input_param_t inp; pfmlib_output_param_t outp; pfmlib_core_input_param_t mod_inp; pfmlib_options_t pfmlib_options; pfarg_pmr_t pc[NUM_PMCS]; pfarg_pmd_attr_t pd[NUM_PMDS]; pfarg_sinfo_t sif; struct pollfd fds; smpl_arg_t buf_arg; pfarg_msg_t msg; smpl_hdr_t *hdr; void *buf_addr; uint64_t pebs_size; pid_t pid; int ret, fd, type; unsigned int i; uint32_t ctx_flags; if (argc < 2) fatal_error("you need to pass a program to sample\n"); if (pfm_initialize() != PFMLIB_SUCCESS) fatal_error("libpfm intialization failed\n"); /* * check we are on an Intel Core PMU */ pfm_get_pmu_type(&type); if (type != PFMLIB_INTEL_CORE_PMU && type != PFMLIB_INTEL_ATOM_PMU) fatal_error("This program only works with an Intel Core processor\n"); /* * pass options to library (optional) */ memset(&pfmlib_options, 0, sizeof(pfmlib_options)); pfmlib_options.pfm_debug = 0; /* set to 1 for debug */ pfmlib_options.pfm_verbose = 1; /* set to 1 for verbose */ pfm_set_options(&pfmlib_options); memset(pd, 0, sizeof(pd)); memset(pc, 0, sizeof(pc)); memset(&inp, 0, sizeof(inp)); memset(&outp, 0, sizeof(outp)); memset(&mod_inp, 0, sizeof(mod_inp)); memset(&sif, 0, sizeof(sif)); memset(&buf_arg, 0, sizeof(buf_arg)); memset(&fds, 0, sizeof(fds)); /* * search for our sampling event */ if (pfm_find_full_event(SMPL_EVENT, &inp.pfp_events[0]) != PFMLIB_SUCCESS) fatal_error("cannot find sampling event %s\n", SMPL_EVENT); inp.pfp_event_count = 1; inp.pfp_dfl_plm = PFM_PLM3; /* * important: inform libpfm we do use PEBS */ mod_inp.pfp_core_pebs.pebs_used = 1; /* * sampling buffer parameters */ pebs_size = 3 * getpagesize(); buf_arg.buf_size = pebs_size; /* * sampling period cannot use more bits than HW counter can supoprt */ buf_arg.cnt_reset = -SMPL_PERIOD; /* * We want a system-wide context for sampling */ ctx_flags = PFM_FL_SYSTEM_WIDE | PFM_FL_SMPL_FMT; /* * trigger notification (interrupt) when reaching the very end of * the buffer */ buf_arg.intr_thres = 
(pebs_size/sizeof(smpl_entry_t))*90/100; /* * we want to measure CPU0, thus we pin ourself to the CPU before invoking * perfmon. This ensures that the sampling buffer will be allocated on the * same NUMA node. */ ret = pin_cpu(getpid(), 0); if (ret) fatal_error("cannot pin on CPU0"); /* * create session and sampling buffer */ fd = pfm_create(ctx_flags, &sif, FMT_NAME, &buf_arg, sizeof(buf_arg)); if (fd == -1) { if (errno == ENOSYS) { fatal_error("Your kernel does not have performance monitoring support!\n"); } fatal_error("cannot create session %s, maybe you do not have the PEBS sampling format in the kernel.\nCheck /sys/kernel/perfmon/formats\n", strerror(errno)); } /* * map buffer into our address space */ buf_addr = mmap(NULL, (size_t)buf_arg.buf_size, PROT_READ, MAP_PRIVATE, fd, 0); printf("session [%d] buffer mapped @%p\n", fd, buf_addr); if (buf_addr == MAP_FAILED) fatal_error("cannot mmap sampling buffer errno %d\n", errno); hdr = (smpl_hdr_t *)buf_addr; printf("pebs_base=0x%llx pebs_end=0x%llx index=0x%llx\n" "intr=0x%llx version=%u.%u\n" "entry_size=%zu ds_size=%zu\n", (unsigned long long)hdr->ds.pebs_buf_base, (unsigned long long)hdr->ds.pebs_abs_max, (unsigned long long)hdr->ds.pebs_index, (unsigned long long)hdr->ds.pebs_intr_thres, PFM_VERSION_MAJOR(hdr->version), PFM_VERSION_MINOR(hdr->version), sizeof(smpl_entry_t), sizeof(hdr->ds)); if (PFM_VERSION_MAJOR(hdr->version) < 1) fatal_error("invalid buffer format version\n"); /* * get which PMC registers are available */ detect_unavail_pmu_regs(&sif, &inp.pfp_unavail_pmcs, NULL); /* * let libpfm figure out how to assign event onto PMU registers */ if (pfm_dispatch_events(&inp, &mod_inp, &outp, NULL) != PFMLIB_SUCCESS) fatal_error("cannot assign event %s\n", SMPL_EVENT); /* * propagate PMC setup from libpfm to perfmon */ for (i=0; i < outp.pfp_pmc_count; i++) { pc[i].reg_num = outp.pfp_pmcs[i].reg_num; pc[i].reg_value = outp.pfp_pmcs[i].reg_value; /* * must disable 64-bit emulation on the PMC0 counter. 
* PMC0 is the only counter useable with PEBS. We must disable * 64-bit emulation to avoid getting interrupts for each * sampling period, PEBS takes care of this part. */ if (pc[i].reg_num == 0) pc[i].reg_flags = PFM_REGFL_NO_EMUL64; } /* * propagate PMD set from libpfm to perfmon */ for (i=0; i < outp.pfp_pmd_count; i++) pd[i].reg_num = outp.pfp_pmds[i].reg_num; /* * setup sampling period for first counter * we want notification on overflow, i.e., when buffer is full */ pd[0].reg_flags = PFM_REGFL_OVFL_NOTIFY; pd[0].reg_value = -SMPL_PERIOD; pd[0].reg_long_reset = -SMPL_PERIOD; pd[0].reg_short_reset = -SMPL_PERIOD; /* * Now program the registers */ if (pfm_write(fd, 0, PFM_RW_PMC, pc, outp.pfp_pmc_count * sizeof(*pc)) == -1) fatal_error("pfm_write error errno %d\n",errno); if (pfm_write(fd, 0, PFM_RW_PMD_ATTR, pd, outp.pfp_pmd_count * sizeof(*pd)) == -1) fatal_error("pfm_write(PMD) error errno %d\n",errno); /* * attach the session to CPU0 */ if (pfm_attach(fd, 0, 0) == -1) fatal_error("pfm_attach error errno %d\n",errno); /* * Create the child task */ signal(SIGCHLD, handler); if ((pid=fork()) == -1) fatal_error("Cannot fork process\n"); if (pid == 0) { /* child does not inherit context file descriptor */ close(fd); /* if child is too short-lived we may not measure it */ child(argv+1); } /* * start monitoring */ if (pfm_set_state(fd, 0, PFM_ST_START) == -1) fatal_error("pfm_set_state(start) error errno %d\n",errno); fds.fd = fd; fds.events = POLLIN; /* * core loop */ for(;done == 0;) { /* * Must use a timeout to avoid a race condition * with the SIGCHLD signal */ ret = poll(&fds, 1, 500); /* * if timeout expired, then check done */ if (ret == 0) continue; if (ret == -1) { if(ret == -1 && errno == EINTR) { warning("read interrupted, retrying\n"); continue; } fatal_error("poll failed: %s\n", strerror(errno)); } ret = read(fd, &msg, sizeof(msg)); if (ret == -1) fatal_error("cannot read perfmon msg: %s\n", strerror(errno)); switch(msg.type) { case PFM_MSG_OVFL: /* the 
sampling buffer is full */ process_smpl_buf(hdr); /* * reactivate monitoring once we are done with the samples * in syste-wide, interface guarantees monitoring is active * upon return from the pfm_restart() syscall */ if (pfm_set_state(fd, 0, PFM_ST_RESTART) == -1) fatal_error("pfm_set_state(restart) error errno %d\n",errno); break; default: fatal_error("unknown message type %d\n", msg.type); } } /* * cleanup child */ waitpid(pid, NULL, 0); /* * stop monitoring, this is required in order to guarantee that the PEBS buffer * header is updated with the latest position, such that we see see the final * samples */ if (pfm_set_state(fd, 0, PFM_ST_STOP) == -1) fatal_error("pfm_set_state(stop) error errno %d\n",errno); /* * check for any leftover samples. Must have monitoring stopped * for this operation to have guarantee it is up to date */ process_smpl_buf(hdr); /* * close session */ close(fd); /* * unmap sampling buffer and actually free the perfmon session */ munmap(buf_addr, (size_t)buf_arg.buf_size); return 0; }