示例#1
0
/*
 * Benchmark driver: for every file named on the command line, map it
 * read-only, touch its pages, and run the compression tests on it.
 *
 * A leading "-s" argument restricts each file to the tests that run before
 * the snappy_only check (test_snappy, plus test_lz4 when COMP is defined).
 *
 * ac is not consulted; av is walked until its terminating NULL entry.
 * Returns 0 after the last file has been processed.
 */
int main(int ac, char **av)
{
	int snappy_only = 0;

	if (av[1] && !strcmp(av[1], "-s")) {
		snappy_only = 1;
		av++;
	}

#ifdef SIMPLE_PMU
	pin_cpu(NULL);
	if(perfmon_available() == 0) {
		printf("no perfmon support\n");
		exit(1);
	}
#endif

	while (*++av) {
		size_t size;
		char *map = mapfile(*av, O_RDONLY, &size);
		if (!map)
			err(*av);

		/*
		 * Read one byte every 4096 bytes through a volatile pointer so
		 * the accesses are not optimized away. The offset is a size_t:
		 * an int would be compared against size as unsigned and would
		 * overflow on mappings of 2 GiB or more.
		 */
		const volatile char *probe = map;
		size_t off;
		for (off = 0; off < size; off += 4096)
			(void)probe[off];

#ifdef COMP
		test_lz4(map, size, *av);
#endif

		test_snappy(map, size, *av);

		/* everything below is skipped in "-s" mode */
		if (!snappy_only) {
#ifdef COMP
			test_lzo(map, size, *av);
			test_zlib(map, size, *av, 1);
			test_zlib(map, size, *av, 3);
			test_lzf(map, size, *av);
			test_quicklz(map, size, *av);
			test_fastlz(map, size, *av);
#endif

#ifdef SNAPREF
			test_snapref(map, size, *av);
#endif
		}

		unmap_file(map, size);
	}
	return 0;
}
示例#2
0
void * driver1(void* arg)
{
	int i,j,k, iter_count = 0;
	uint64_t line_count=0;
	size_t * write_pntr;
	write_pntr = array;
// pin core affinity
	if(pin_cpu(pid, cpu_write) == -1) {
		err(1,"cannot set cpu write affinity");
		}
	else{
		printf(" write thread pinned to core %d to run\n",cpu_write);
		}
	fprintf(stderr,"from writer, total_lines = %ld, shared = %d\n",
		total_lines,shared);
	
	while(line_count < total_lines)
		{
		i = 0;
		while(exchange_flag == 1)
			{
			i++;
			}
		if(shared == 0)
			{
//			if(iter_count < 10)fprintf(stderr,"writer calling write kernel\n");
			write_pntr = write_buf(seg_size, write_pntr);
//			if(iter_count < 10)fprintf(stderr,"writer returned from write kernel\n");
			}
		else 
			{
//			if(iter_count < 10)fprintf(stderr,"writer calling read kernel\n");
			write_pntr = read_buf(seg_size, write_pntr);
//			if(iter_count < 10)fprintf(stderr,"writer returned from read kernel\n");
			}
		line_count += seg_size;
		iter_count++;
		exchange_flag = 1;
		}
	pthread_exit(NULL);
	
}
示例#3
0
void * driver0(void * arg)
{
	int i,j,k, iter_count =0;
	uint64_t line_count=0, init_tsc, end_tsc;
	size_t * read_pntr;

	read_pntr = array;
// pin core affinity
	if(pin_cpu(pid, cpu_read) == -1) {
		err(1,"cannot set cpu read affinity");
		}
	else{
		printf(" read thread pinned to core %d to run\n",cpu_read);
		}
	fprintf(stderr,"total_lines = %ld\n",total_lines);

	read_sum_tsc = 0;
	while(line_count < total_lines)
		{
		i = 0;
		while(exchange_flag == 0)
			{
			i++;
			}
		init_tsc = _rdtsc();
//		if(iter_count < 10)fprintf(stderr,"reader calling kernel\n");
		read_pntr = read_buf(seg_size, read_pntr);
//		if(iter_count < 10)fprintf(stderr,"reader returned from kernel\n");
		end_tsc = _rdtsc();
		read_sum_tsc += (end_tsc - init_tsc);
		line_count += seg_size;
		iter_count++;
		exchange_flag = 0;
		}
	fprintf(stderr," from read thread, line_count = %ld, TSC sum = %lu, latency = %g\n",
			line_count, read_sum_tsc,(double)read_sum_tsc/(double)line_count);
	pthread_exit(NULL);
}
示例#4
0
/*
 * Count the events listed in options.events on this process while it
 * ping-pongs one byte with a forked child over a pair of pipes.
 *
 * Both processes are pinned to options.cpu (a random online CPU when it is
 * -1). The loop runs until `quit` becomes non-zero; an alarm of
 * options.delay seconds is armed with sig_handler installed for SIGALRM
 * (presumably the handler sets quit -- it is defined outside this block).
 * Each count is then read, scaled, and printed, the child is killed and all
 * descriptors are released. Any failure terminates the process via
 * err()/errx()/exit().
 */
static void
measure(void)
{
	perf_event_desc_t *fds = NULL;
	int num_fds = 0;
	uint64_t values[3];
	ssize_t n;
	int i, ret;
	int pr[2], pw[2];
	pid_t pid;
	char cc = '0';

	ret = pfm_initialize();
	if (ret != PFM_SUCCESS)
		err(1, "cannot initialize libpfm");

	if (options.cpu == -1) {
		srandom(getpid());
		options.cpu = random() % sysconf(_SC_NPROCESSORS_ONLN);
	}

	ret = pipe(pr);
	if (ret)
		err(1, "cannot create read pipe");

	ret = pipe(pw);
	if (ret)
		err(1, "cannot create write pipe");

	ret = perf_setup_list_events(options.events, &fds, &num_fds);
	if (ret || !num_fds)
		exit(1);

	/* events are created disabled; they are enabled just before the loop */
	for(i=0; i < num_fds; i++) {
		fds[i].hw.disabled = 1;
		fds[i].hw.read_format = PERF_FORMAT_SCALE;

		fds[i].fd = perf_event_open(&fds[i].hw, 0, -1, -1, 0);
		if (fds[i].fd == -1)
			err(1, "cannot open event %d", i);
	}

	/*
	 * Pin to options.cpu, inherited by child process. That will enforce
	 * the ping-ponging and thus stress the PMU context switch
	 * which is what we want
	 */
	ret = pin_cpu(getpid(), options.cpu);
	if (ret)
		err(1, "cannot pin to CPU%d", options.cpu);

	printf("Both processes pinned to CPU%d, running for %d seconds\n", options.cpu, options.delay);

	/*
	 * create second process which is not monitoring at the moment
	 */
	switch(pid=fork()) {
		case -1:
			err(1, "cannot create child\n");
		case 0:
			/* do not inherit session fd */
			for(i=0; i < num_fds; i++)
				close(fds[i].fd);
			/* pr[]: write master, read child */
			/* pw[]: read master, write child */
			close(pr[1]); close(pw[0]);
			do_child(pr[0], pw[1]);
			exit(1);
	}

	/* parent keeps only its own ends of the two pipes */
	close(pr[0]);
	close(pw[1]);

	/*
	 * Let's roll now
	 */
	prctl(PR_TASK_PERF_EVENTS_ENABLE);
	signal(SIGALRM, sig_handler);
	alarm(options.delay);

	/*
	 * ping pong loop
	 */
	while(!quit) {
		n = write(pr[1], "c", 1);
		if (n < 1)
			err(1, "write failed");
		n = read(pw[0], &cc, 1);
		if (n < 1)
			err(1, "read failed");
	}

	prctl(PR_TASK_PERF_EVENTS_DISABLE);

	for(i=0; i < num_fds; i++) {
		uint64_t val;
		double ratio;

		ret = read(fds[i].fd, values, sizeof(values));
		if (ret == -1)
			err(1,"pfm_read error");
		if (ret != sizeof(values))
			errx(1, "did not read correct amount %d", ret);

		val = perf_scale(values);
		ratio = perf_scale_ratio(values);

		if (ratio == 1.0)
			printf("%20"PRIu64" %s\n", val, fds[i].name);
		else
			if (ratio == 0.0)
				printf("%20"PRIu64" %s (did not run: competing session)\n", val, fds[i].name);
			else
				printf("%20"PRIu64" %s (scaled from %.2f%% of time)\n", val, fds[i].name, ratio*100.0);
	}
	/*
	 * kill child process: kill() takes the pid first and the signal
	 * second (the arguments used to be swapped, so the child was never
	 * signalled)
	 */
	kill(pid, SIGKILL);

	/*
	 * close pipes
	 */
	close(pr[1]);
	close(pw[0]);
	/*
	 * and destroy our session
	 */
	for(i=0; i < num_fds; i++)
		close(fds[i].fd);

	perf_free_fds(fds, num_fds);

	/* free libpfm resources cleanly */
	pfm_terminate();
}
示例#5
0
文件: libpfms.c 项目: naps62/CPD_PAPI
static void
pfms_thread_mainloop(void *arg)
{
	long k = (long )arg;
	uint32_t mycpu = (uint32_t)k;
	pfarg_ctx_t myctx, *ctx;
	pfarg_load_t load_args;
	int fd = -1;
	pfms_thread_t *td;
	sem_t *cmd_sem;
	int ret = 0;

	memset(&load_args, 0, sizeof(load_args));
	load_args.load_pid = mycpu;
	td = tds+mycpu;

	ret = pin_cpu(mycpu);
	dprint("CPU%u wthread created and pinned ret=%d\n", mycpu, ret);

	cmd_sem = &tds[mycpu].cmd_sem;

	for(;;) {
		dprint("CPU%u waiting for cmd\n", mycpu);

		sem_wait(cmd_sem);

		switch(td->cmd) {
			case CMD_NONE:
				ret = 0;
				break;

			case CMD_CTX:

				/*
				 * copy context to get private fd
				 */
				ctx = td->data;
				myctx = *ctx;

				fd = pfm_create_context(&myctx, NULL, NULL, 0);
				ret = fd < 0 ? -1 : 0;
				dprint("CPU%u CMD_CTX ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;

			case CMD_LOAD:
				ret = pfm_load_context(fd, &load_args);
				dprint("CPU%u CMD_LOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_UNLOAD:
				ret = pfm_unload_context(fd);
				dprint("CPU%u CMD_UNLOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_START:
				ret = pfm_start(fd, NULL);
				dprint("CPU%u CMD_START ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_STOP:
				ret = pfm_stop(fd);
				dprint("CPU%u CMD_STOP ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_WPMCS:
				ret = pfm_write_pmcs(fd,(pfarg_pmc_t *)td->data, td->ndata);
				dprint("CPU%u CMD_WPMCS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_WPMDS:
				ret = pfm_write_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata);
				dprint("CPU%u CMD_WPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_RPMDS:
				ret = pfm_read_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata);
				dprint("CPU%u CMD_RPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_CLOSE:
				dprint("CPU%u CMD_CLOSE fd=%d\n", mycpu, fd);
				ret = close(fd);
				fd = -1;
				break;
			default:
				break;
		}
		td->ret = ret;

		dprint("CPU%u td->ret=%d\n", mycpu, ret);

		barrier_wait(td->barrier);
	}
}
示例#6
0
/*
 * Random-walk memory latency driver.
 *
 * Options (parsed with getopt):
 *   -i cpu     core to pin to while the buffers are mapped and zeroed
 *   -r cpu     core to pin to while reader() runs
 *   -l lines   number of lines per walk (must be positive)
 *   -s stride  stride between lines in pages (64 bytes are added)
 *   -S nseg    number of segments used to scramble the visiting order
 *   -m mult    multiplier applied to the default call count of 100
 *   -L         map the buffers with MAP_HUGETLB and 2 MiB pages
 *
 * Builds the index table, maps and zeroes two buffers, calls reader() iter
 * times and prints the total TSC cycles and the average per iteration.
 * Returns 0; every failure path exits through err().
 */
int main(int argc, char ** argv)
{
	char *buf1;
	size_t ret_val = 0;
	size_t array_stride;
	/* option values start at 0 so a missing option is never read uninitialised */
	int i, j, cpu = 0, cpu_run = 0, line_count = 0, stride = 0, fd = -1;
	off_t offset = 0;
	int len=10240000, iter=100, mult=1, main_ret=0;
	double iterations;
	double *a, *b;
	size_t start, stop, run_time, call_start, call_stop, call_run_time;
	__pid_t pid=0;
	size_t buf_size, zero_loop, ind, zi;
	size_t num_pages, page_size, var_size;
	int *pattern;
	int step, c;
	int *index, lc_by_num_seg, count, num_seg=32, huge=0;
	int map_flags;

	page_size = 4096;

//	process input arguments

	if(argc < 6){
		fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc);
		usage();
		err(1,"insufficient invocation arguments");
		}

	while ((c = getopt(argc, argv, "i:r:l:s:S:m:L")) != -1) {
		switch(c) {
		case 'i':
			cpu = atoi(optarg);
			break;
		case 'r':
			cpu_run = atoi(optarg);
			break;
		case 'l':
			line_count = atoi(optarg);
			break;
		case 's':
			stride = atoi(optarg);
			break;
		case 'S':
			num_seg = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		case 'L':
			huge=1;
			page_size = 2 * 1024 * 1024;
			break;
		default:
			err(1, "unknown option %c", c);
		}
	}
	iter = iter*mult;

	/* both values size allocations below, and num_seg is used as a divisor */
	if(line_count <= 0 || num_seg <= 0){
		usage();
		err(1,"line count (-l) and segment count (-S) must be positive");
		}

	var_size = sizeof(size_t);
	fprintf(stderr, "size_t in %zu bytes\n",var_size);

// pin core affinity
	if(pin_cpu(pid, cpu) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d\n",cpu);
		}

	pattern = malloc((size_t)num_seg * sizeof *pattern);
	if(pattern == NULL)
		{
		fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg);
		err(1,"malloc of pattern failed");
		}

// calculate stride and buffer size
	stride = page_size*stride + 64;
	buf_size = (size_t)line_count*(size_t)stride;
	num_pages = buf_size/page_size + 2;
	buf_size = page_size*num_pages;
	array_stride = stride/sizeof(double);
	iterations = (double)iter*(double)len;

//    create index array for "random" pattern
	/*
	 * One spare slot: the scrambled generator below pre-increments its
	 * write position and therefore stores into index[1..line_count].
	 * Only index[0..line_count-1] are consumed afterwards.
	 */
	index = malloc(((size_t)line_count + 1) * sizeof *index);
	if(index == NULL)
		{
		fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count);
		err(1,"failed to malloc index");
		}
	if(num_seg == 1)
		{
		/* fill every slot; the last one used to be left uninitialised */
		for(i=0; i<line_count; i++)index[i] = i;
		}
	else
		{
		rndm_list(pattern,num_seg);
		lc_by_num_seg = line_count/num_seg;
		if(lc_by_num_seg*num_seg != line_count)
			{
			fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg);
			err(1," bad line_count");
			}
		count=0;
		for(i=0; i<lc_by_num_seg; i++)
			{
			step = 0;
			for(j=0;j<num_seg;j++)
				{
				count++;
				if(j == (num_seg-1) ) step = 1;
				ind = lc_by_num_seg*pattern[j];
				index[count]= (int) ind + i + step;
				if(index[count] >= line_count)
					printf(" count = %d, index = %d\n",count,index[count]);
				}
			}
		}
	index[0] = 0;
	free(pattern);

	/* convert line numbers into element offsets */
	for(i=0; i<line_count; i++)index[i] = index[i]*array_stride;

// map and initialize buffers
	map_flags = MAP_PRIVATE | MAP_ANON;
	if(huge == 1)
		map_flags |= MAP_HUGETLB;

	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, map_flags, fd, offset);
	if(buf1 == MAP_FAILED)
		{
		fprintf(stderr,"mmap failed\n");
		err(1,"mmap failed");
		}
	fprintf(stderr," buf1 for a = %p\n",(void *)buf1);
	a = (double*) buf1;

	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, map_flags, fd, offset);
	if(buf1 == MAP_FAILED)
		{
		fprintf(stderr,"mmap failed\n");
		err(1,"mmap failed");
		}
	fprintf(stderr," buf1 for b = %p\n",(void *)buf1);
	b = (double*)buf1;

	zero_loop = buf_size/sizeof(double);
	fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zu\n",buf_size,zero_loop,array_stride);

	/* size_t counter: zero_loop can exceed INT_MAX for large buffers */
	for(zi=0; zi<zero_loop; zi++) a[zi] = 0;
	for(zi=0; zi<zero_loop; zi++) b[zi] = 0;
	fprintf(stderr," finished zeroing buf for a, b\n");

// pin core affinity
	if(pin_cpu(pid, cpu_run) == -1) {
		err(1,"cannot set cpu run affinity");
		}
	else{
		printf(" process pinned to core %d to run\n",cpu_run);
		}

// run the walker
	printf(" calling walker %d times which loops  %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
		ret_val = reader(len,line_count,a,b,index);
		stop = _rdtsc();
		run_time = stop - start;
		}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
	printf(" run time = %zu\n",call_run_time);

//  printout
	printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations);

	free(index);
	munmap(a, buf_size);
	munmap(b, buf_size);
	return main_ret;
}
示例#7
0
int main(int argc, char ** argv)
{
	char * buf1;
	void * ret;
	size_t ret_val = 0;
	size_t  array_stride;
	int rc0, rc1;
	int i,j,k, line_count=0,stride=0, fd = -1;
	off_t offset = 0;
	int len=10240000, iter=10,mult=1,main_ret=0;
	double iterations;
	size_t start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0;
	size_t buf_size,jj,zero_loop, buf_by_num_seg,ind;
	size_t num_pages, page_size, var_size;
	int cpu_setsize;
	cpu_set_t mask;
//	size_t pattern[] = {4,1,5,2,6,3,7,0};
	int *pattern;
	int step, c;
	int* index, lc_by_num_seg,count, num_seg=32, huge=0;
	unsigned int bitmask, *intstar;
	void *arg;

	pthread_t * Thread_dat;

	page_size = 4096;
	shared = 0;

//	process input arguments

	if(argc < 6){
		fprintf(stderr,"the random walker requires at least 6 arguments (only the 7th in the list below is optional), there were %d\n",argc);
		usage();
		err(1,"insufficient invocation arguments");
		}

	while ((c = getopt(argc, argv, "i:r:w:l:S:m:L:sh")) != -1) {
		switch(c) {
		case 'i':
			cpu = atoi(optarg);
			break;
		case 'r':
			cpu_read = atoi(optarg);
			break;
		case 'w':
			cpu_write = atoi(optarg);
			break;
		case 'l':
			seg_size = atoi(optarg);
			break;
		case 's':
			shared = 1;
			break;
		case 'S':
			num_seg = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		case 'L':
			line_count = atoi(optarg);
			break;
		case 'h':
			usage();
			exit(1);
		default:
			err(1, "unknown option %c", c);
		}
	}
	iter = iter*mult;
	total_lines = len*iter;
	fprintf(stderr," seg_size = %d, line_count = %d, total_lines = %ld\n",
		seg_size, line_count, total_lines);

	var_size = sizeof(size_t);
	fprintf(stderr, "size_t in %zd bytes\n",var_size);
	Thread_dat = (pthread_t *) malloc(MAX_THREAD*sizeof(pthread_t));
	if(Thread_dat == NULL)
		err(1, "malloc of Thread failed");

// pin core affinity

	if(pin_cpu(pid, cpu) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d\n",cpu);
		}

	pattern = (int*) malloc(num_seg*sizeof(int));
	if(pattern == NULL)
		{
		fprintf(stderr," failed to malloc pattern for size = %d\n",num_seg);
		err(1,"malloc of pattern failed");
		}

// calculate stride and buffer size
	stride = page_size*stride + 64;
	buf_size = (size_t)line_count*(size_t)stride;
	num_pages = buf_size/page_size + 2;
	buf_size = page_size*num_pages;
	array_stride = stride/sizeof(size_t *);
	iterations = (double)iter*(double)len;

//    create index array for "random" patterna
	index = (int*)malloc(line_count*sizeof(int));
	if(index == NULL)
		{
		fprintf(stderr," failed to malloc index array for line_count of %d\n",line_count);
		err(1,"failed to malloc index");
		}
	if(num_seg == 1)
		{
		for(i=0; i<line_count; i++)index[i] = i;
		}
	else
		{
		
//	fprintf(stderr," calling rndm_list, n = %d\n",num_seg);
		rndm_list(pattern,num_seg);
		lc_by_num_seg = line_count/num_seg;
		if(lc_by_num_seg*num_seg != line_count)
			{
			fprintf(stderr," line count must be a multiple of the fifth argument num_seg = %d\n", num_seg);
			err(1," bad line_count");
			}
		count=0;
		buf_by_num_seg = buf_size/num_seg;
		for(i=0; i<lc_by_num_seg; i++)
			{
			step = i*num_seg;
			for(j=0;j<num_seg;j++)
				{
				count++;
				ind = lc_by_num_seg*pattern[j];
				index[count]= (int) pattern[j] + step;
				if(index[count] >= line_count)
					printf(" count = %d, index = %d\n",count,index[count]);
				}
			count++;
			index[count]=(i+1)*num_seg;
			}
		index[count] = 0;
		}
        index[0] = 0;
//      test index map for every value between 0 and line_count-1 showing up once

        for(i=0;i<line_count;i++)
                {
                if(index[i] > line_count)err(1,"index[%d] = %d",i,index[i]);
//              if(i < 128 )fprintf(stderr,"index[%d] = %d\n",i,index[i]);
//              if(i > line_count - 128 )fprintf(stderr,"index[%d] = %d\n",i,index[i]);
                index_test[index[i]]++;
                }

        bad = 0;
        for(i=0;i<line_count;i++)
                {
                if(index_test[i] != 1)
                        {
                        bad++;
                        if(bad < 64)fprintf(stderr,"index_test[%d] = %d\n",i,index_test[i]);
                        }
                }
        fprintf(stderr,"bad = %d\n",bad);


// malloc and initialize buffers

	buf1 = (char *)malloc(buf_size + 4096 );

//    replace malloc call with a call to mmap
/*
	if(huge == 0)
	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON , fd, offset);
	if(huge == 1)
	buf1 = (char*) mmap(NULL,buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB , fd, offset);
	if(buf1 == MAP_FAILED)
		{
		fprintf(stderr,"mmap failed\n");
		err(1,"mmap failed");
		} 
*/
	fprintf(stderr," buf1 = %p\n",buf1);
//	buf1 = buf1 + (0x1000 - (size_t)buf1 & 0xFFF) ;
//	fprintf(stderr," buf1 = %p\n",buf1);

	zero_loop = buf_size/(size_t)var_size;
	fprintf(stderr, " buf_size = %zu, zero_loop = %zu, array_stride = %zd\n",buf_size,zero_loop,array_stride);
//	for(i=0;i<buf_size;i++)buf1[i]=0;   //touch every page to ensure creation
	array = (size_t *) buf1;
//	for(i=0; i<zero_loop; i++) array[i] = 0;
	ret = memset(buf1, 0, (size_t)buf_size);
	fprintf(stderr," finished zeroing buf ret = %p\n",ret);

//	for(jj=0;jj<line_count-1; jj++)array[jj*(size_t)array_stride] = (size_t) &array[(size_t)array_stride*(jj+1)];
	for(jj=0;jj<line_count-1;jj++)array[index[jj]*array_stride] = (size_t)&array[index[jj+1]*array_stride];
	fprintf(stderr," target of last element in loop = %zx\n",(size_t)(array[line_count-1]-(size_t)buf1));
	array[(size_t)array_stride*index[line_count-1]] = (size_t)&array[0];

//	for(jj=0; jj< line_count; jj+=8) printf(", jj = %d, array[jj]-&array[0]/array_stride = %d\n",jj,(array[jj]-(size_t)&array[0])/array_stride);

// run the walker
	printf(" invoking reader %d times which loops  %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size);

	exchange_flag = 0;
	rc0 = pthread_create(&Thread_dat[0], NULL, driver0, (void *)arg);
	if(rc0)
		err(1,"failed to start thread for driver0");

	printf(" invoking writer %d times which loops  %d times on buffer of %d lines with a stride of %d, for a total size of %zu\n",iter,len,line_count,stride,buf_size);
	rc1 = pthread_create(&Thread_dat[1], NULL, driver1, (void *)arg);
	if(rc1)
		err(1,"failed to start thread for driver1");
	printf(" run time = %zd\n",call_run_time);

//  printout
	printf(" average cycles per iteration = %f\n", (double)call_run_time/iterations);
	pthread_exit(NULL);
	return main_ret;
}
示例#8
0
/*
 * Triad bandwidth driver.
 *
 * Options (parsed with getopt):
 *   -i cpu    core to pin to while the buffers are allocated and filled
 *   -r cpu    core to pin to while triad() runs
 *   -l level  index 0..3 into the {L1, L2, L3, L4} size table
 *   -m mult   call-count multiplier (default 1)
 *   -a/-b/-c  byte offsets (0..1024) added to the page-aligned start of
 *             the a, b and c arrays
 *
 * Calls triad() iter times, timing each call with _rdtsc(), and prints the
 * total bytes moved plus the average and best bytes per cycle.
 * Returns 0; every failure path exits through err().
 */
int
main(int argc, char ** argv)
{
	double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0;
	char * buf1, *buf2, *buf3;
	int i, offset_a=0, offset_b=0, offset_c=0, mult=1, iter=100, c_val;
	/* mem_level starts out of range so a missing -l is detected below */
	int len, mem_level=-1, level_size[4], cpu=0, cpu_run=0, bytes_per, scale;
	unsigned long long start, stop, run_time=0, call_start, call_stop, call_run_time, total_bytes=0;
	long long call_bytes;
	__pid_t pid=0;


//	process input arguments

	if(argc < 3 ){
		printf("triad driver needs at least 3 arguments, cpu_init, cpu_run, cache_level, [call count multiplier  def = 1], [offset a, offset_b, offset_c  defaults = 0] \n");
		printf(" argc = %d\n",argc);
		usage();
		err(1, "bad arguments");
		}


	len = L4;
	/* every option takes a value; "c" used to lack its colon, so -c read a NULL optarg */
	while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c:")) != -1) {
		switch(c_val) {
		case 'i':
			cpu = atoi(optarg);
			break;
		case 'r':
			cpu_run = atoi(optarg);
			break;
		case 'l':
			mem_level = atoi(optarg);
			break;
		case 'm':
			mult = atoi(optarg);
			break;
		case 'a':
			offset_a = atoi(optarg);
			break;
		case 'b':
			offset_b = atoi(optarg);
			break;
		case 'c':
			offset_c = atoi(optarg);
			break;
		default:
			err(1, "unknown option %c", c_val);
		}
	}
	iter = iter*mult;

	/* mem_level indexes level_size[4] */
	if(mem_level < 0 || mem_level > 3){
		usage();
		err(1, "memory level (-l) must be between 0 and 3");
		}
	/* each buffer has 4096 + 1024 spare bytes: up to 4095 for alignment, the rest for the offset */
	if(offset_a < 0 || offset_a > 1024 || offset_b < 0 || offset_b > 1024 ||
	   offset_c < 0 || offset_c > 1024){
		usage();
		err(1, "offsets (-a, -b, -c) must be between 0 and 1024");
		}


// pin core affinity for initialization
	if(pin_cpu(pid, cpu) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d for initialization\n",cpu);
		}


// set buffer sizes and loop tripcounts based on memory level
	level_size[0]=L1;
	level_size[1]=L2;
	level_size[2]=L3;
	level_size[3]=L4;
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);
	len = level_size[mem_level]/32;
	if(len <= 0)
		err(1, "memory level %d is too small", mem_level);
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);
	scale = level_size[3]/(32*len);
	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d, scale = %d\n",len, mem_level, iter,mult,scale);
	/* NOTE(review): mult was already applied to iter above -- confirm the second factor is intended */
	iter =iter*scale*mult;

	fprintf(stderr, "len = %d, mem_level = %d, iter = %d, mult = %d\n",len, mem_level, iter,mult);

// malloc and initialize buffers
	buf1 = malloc(sizeof(double)*len + 4096 + 1024);
	buf2 = malloc(sizeof(double)*len + 4096 + 1024);
	buf3 = malloc(sizeof(double)*len + 4096 + 1024);
	if(buf1 == NULL || buf2 == NULL || buf3 == NULL)
		err(1, "malloc of triad buffers failed");

	/* round each pointer up to the next 4 KiB boundary, then add the requested offset */
	fprintf(stderr," buf1 = %p\n",(void *)buf1);
	buf1 = buf1 + ((0x1000 - (size_t)buf1) & 0xFFF) + offset_a;
	fprintf(stderr," buf1 = %p\n",(void *)buf1);
	a = (double *) buf1;
	fprintf(stderr," buf2 = %p\n",(void *)buf2);
	buf2 = buf2 + ((0x1000 - (size_t)buf2) & 0xFFF) + offset_b;
	fprintf(stderr," buf2 = %p\n",(void *)buf2);
	b = (double *) buf2;
	fprintf(stderr," buf3 = %p\n",(void *)buf3);
	buf3 = buf3 + ((0x1000 - (size_t)buf3) & 0xFFF) + offset_c;
	fprintf(stderr," buf3 = %p\n",(void *)buf3);
	c = (double *) buf3;

	for(i=0;i<len;i++){
		a[i] = 0.;
		b[i] = 10.;
		c[i] = 10.;
		}

// pin core affinity for triad run
	if(pin_cpu(pid, cpu_run) == -1) {
		err(1,"failed to set affinity");
		}
	else{
		fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
		}

// run the triad
	printf(" calling triad %d times with len = %d\n",iter,len);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
		bytes_per = triad(len,xx,a,b,c);
		stop = _rdtsc();
		run_time = stop - start;
		xx+=0.01;
		/* widen before multiplying so len*bytes_per cannot overflow int */
		call_bytes = (long long)len*bytes_per;
		total_bytes += call_bytes;
		bw=(double)call_bytes/(double)run_time;
		if(bw > best_bw) best_bw = bw;
		}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
	avg_bw=(double)(total_bytes)/(double)call_run_time;
//  printout
	printf(" transfering %llu bytes from memory level %d took %llu cycles/call and a total of %llu\n",total_bytes,mem_level,run_time,call_run_time);
	printf(" average bytes/cycle = %f\n", avg_bw);
	printf(" best bytes/cycle = %f\n",best_bw);
	return 0;
}
示例#9
0
/*
 * Set up and start the PAPI measurements for thread slot `tid`.
 *
 * Depending on the MEASURE_* macros defined at build time this
 *   - reserves one index in thr_vars[tid] per hardware event,
 *   - opens the per-thread output file OUTFILEID<tid>OUTFILEEXT for writing,
 *   - estimates the clock rate from PAPI_get_real_cyc() across sleep(1),
 *   - creates the hardware event set and, with MEASURE_ENERGY, an event set
 *     holding every event of the "rapl" component,
 *   - adds the events and starts counting.
 *
 * Allocation or file-open failures terminate the process with exit(1);
 * PAPI failures are reported through papi_fail().
 */
void PAPI_HW_COUNTER_open(int tid){
    // set events to measure
    int *Events;
    int EventCode;
    int event_ctr = 0;
    int retval;
  #ifdef MEASURE_TIME
  #endif

  #ifdef MEASURE_CPI
    thr_vars[tid].papi_idx_inst = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_cyc = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_MEMACC
    thr_vars[tid].papi_idx_load = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_store = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_LLCMISS
    thr_vars[tid].papi_idx_llcmiss = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_ICACHEMISS
    thr_vars[tid].papi_idx_icachemiss = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_DCACHEMISS
    thr_vars[tid].papi_idx_l1dcm = thr_vars[tid].num_events++;
    thr_vars[tid].papi_idx_l1dca = thr_vars[tid].num_events++;
  #endif

  #ifdef MEASURE_ENERGY
  #endif

    event_ctr = 0;  // reset event counter

    // malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure
    Events = malloc(sizeof(int)*thr_vars[tid].num_events);
    if(Events == NULL && thr_vars[tid].num_events > 0){
        printf("ERROR: Failed to allocate memory for Events.\n");
        exit(1);
    }
    thr_vars[tid].values = malloc(sizeof(long long)*thr_vars[tid].num_events);
    if(thr_vars[tid].values == NULL && thr_vars[tid].num_events > 0){
        printf("ERROR: Failed to allocate memory for values.\n");
        exit(1);
    }

  #ifdef __ARM_ARCH_7A__
    // pin processor only on arm arch.
    pid_t pid = getpid();
    int core = 0;
    printf("Pinning thread %d to cores %d..%d\n", pid, 0, 0);
    printf("Observe in terminal via \"ps -p <PID> -L -o pid,tid,psr\"\n");
    pin_cpu(pid, core);
    printf("Pinned to core %d\n", core);
  #endif

    // Open file to output; 16 bytes hold any int in decimal including sign and NUL
    char filename_id[16];
    snprintf(filename_id, sizeof(filename_id),"%d",tid);
    char* filename_w_id;
    filename_w_id=malloc(strlen(OUTFILEID)+strlen(OUTFILEEXT)+strlen(filename_id)+1);
    if (filename_w_id == NULL){
        printf("ERROR: Failed to allocate memory for the output file name.\n");
        exit(1);
    }
    strcpy(filename_w_id, OUTFILEID);
    strcat(filename_w_id, filename_id);
    strcat(filename_w_id, OUTFILEEXT);

    thr_vars[tid].f=fopen(filename_w_id, "w");
    if (thr_vars[tid].f == NULL){
        printf("failed to open file %s.\n", filename_w_id);
        exit(1);
    }
    // the name is only needed for fopen and its error message
    free(filename_w_id);

    // Measure clock frequency: cycles elapsed across a one second sleep
    long long elapsed_cyc;
    elapsed_cyc = PAPI_get_real_cyc();
    sleep(1);
    elapsed_cyc = PAPI_get_real_cyc()-elapsed_cyc;
    thr_vars[tid].PAPI_CLOCK_RATE = elapsed_cyc;
    printf("Measured clock frequency: %.0lld Hz\n",thr_vars[tid].PAPI_CLOCK_RATE);

    // Set EventSet
    thr_vars[tid].EventSet = PAPI_NULL;/*EventSet*/
    retval=PAPI_create_eventset(&(thr_vars[tid].EventSet));
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_create_eventset()", retval);
    }


  #ifdef MEASURE_TIME
  #endif

  #ifdef MEASURE_CPI
    retval = PAPI_event_name_to_code( PAPI_INST , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, inst", retval);
    }
    Events[event_ctr++] = EventCode;

    retval = PAPI_event_name_to_code( PAPI_CYC , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, cyc", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_MEMACC
    retval = PAPI_event_name_to_code( PAPI_MEM_LOAD , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, loads", retval);
    }
    Events[event_ctr++] = EventCode;

    retval=PAPI_event_name_to_code( PAPI_MEM_STORE , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, stores", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_LLCMISS
    retval = PAPI_event_name_to_code( PAPI_LLC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, llc miss", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_ICACHEMISS
    retval = PAPI_event_name_to_code( PAPI_IC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        // message used to say "llc miss" (copy-paste from the block above)
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, icache miss", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_DCACHEMISS
    retval = PAPI_event_name_to_code( PAPI_L1_DC_MISS , &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, Level 1 data cache misses", retval);
    }
    Events[event_ctr++] = EventCode;

    retval = PAPI_event_name_to_code( PAPI_L1_DC_ACCESS, &EventCode );
    if (retval != PAPI_OK ) {
        papi_fail(__FILE__, __LINE__,
                "PAPI_event_name_to_code, Level 1 data cache accesses", retval);
    }
    Events[event_ctr++] = EventCode;
  #endif

  #ifdef MEASURE_ENERGY
    printf("Probing all RAPL events\n");

    thr_vars[tid].numcmp = PAPI_num_components();

    // look for the component whose name contains "rapl"
    for(thr_vars[tid].cid=0; thr_vars[tid].cid<thr_vars[tid].numcmp; thr_vars[tid].cid++) {
        if ( (thr_vars[tid].cmpinfo = PAPI_get_component_info(thr_vars[tid].cid)) == NULL) {
            papi_fail(__FILE__, __LINE__,"PAPI_get_component_info failed\n", 0);
        }
        if (strstr(thr_vars[tid].cmpinfo->name,"rapl")) {
            thr_vars[tid].rapl_cid=thr_vars[tid].cid;
            printf("Found rapl component at cid %d.\n",thr_vars[tid].rapl_cid);
            if (thr_vars[tid].cmpinfo->disabled) {
                printf("RAPL component disabled: %s\n",
                        thr_vars[tid].cmpinfo->disabled_reason);
                exit(EXIT_FAILURE);
            }
            break;
        }
    }

    if (thr_vars[tid].cid==thr_vars[tid].numcmp) {
        // Component not found:
        papi_fail(__FILE__,__LINE__,"No rapl component found\n",0);
    }

    // initialise the handle to PAPI_NULL first, as is done for EventSet above
    thr_vars[tid].EnergyEventSet = PAPI_NULL;
    retval = PAPI_create_eventset( &(thr_vars[tid].EnergyEventSet) );
    if (retval != PAPI_OK){
        papi_fail(__FILE__,__LINE__, "PAPI_create_eventset()", retval);
    }

  // Add all events:
  int r;
  thr_vars[tid].code = PAPI_NATIVE_MASK;
  r = PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_FIRST, thr_vars[tid].rapl_cid );
  while ( r == PAPI_OK ) {
     retval = PAPI_event_code_to_name( thr_vars[tid].code, thr_vars[tid].event_names[thr_vars[tid].num_energy_events] );
     if ( retval != PAPI_OK ) {
        printf("Error translating %#x\n",thr_vars[tid].code);
        papi_fail(__FILE__, __LINE__,
                  "PAPI_event_code_to_name", retval );
     }

     printf("Found event: %s\n", thr_vars[tid].event_names[thr_vars[tid].num_energy_events]);

     retval = PAPI_get_event_info(thr_vars[tid].code,&(thr_vars[tid].evinfo));
     if (retval != PAPI_OK) {
        papi_fail(__FILE__, __LINE__,
                  "Error getting event info\n",retval);
     }

     strncpy(thr_vars[tid].units[thr_vars[tid].num_energy_events],thr_vars[tid].evinfo.units,PAPI_MIN_STR_LEN);
     // strncpy does not terminate when the source fills the whole field
     thr_vars[tid].units[thr_vars[tid].num_energy_events][PAPI_MIN_STR_LEN-1] = '\0';
     thr_vars[tid].data_type[thr_vars[tid].num_energy_events] = thr_vars[tid].evinfo.data_type;

     retval = PAPI_add_event(thr_vars[tid].EnergyEventSet, thr_vars[tid].code);
     if (retval != PAPI_OK ) {
         papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval);
     }

     r = PAPI_enum_cmp_event( &(thr_vars[tid].code), PAPI_ENUM_EVENTS, thr_vars[tid].rapl_cid );
     thr_vars[tid].num_energy_events++;
  }

    thr_vars[tid].energy_values = malloc(sizeof(long long)*thr_vars[tid].num_energy_events);
    if(thr_vars[tid].energy_values == NULL && thr_vars[tid].num_energy_events > 0){
        printf("ERROR: Failed to allocate memory for energy values.\n");
        exit(1);
    }

  #endif

  #ifdef MEASURE_HW_COUNTER
    int k;
    for(k = 0; k < thr_vars[tid].num_events; k++){
        retval = PAPI_add_event(thr_vars[tid].EventSet, Events[k]);
        if (retval != PAPI_OK ) {
           printf("At event %d:\n",k);
           papi_fail( __FILE__, __LINE__, "PAPI_add_event()", retval);
        }
    }

    retval=PAPI_start(thr_vars[tid].EventSet);
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_start()", retval);
    }
  #endif

    // the code list is not referenced again below
    free(Events);

  #ifdef MEASURE_ENERGY
    retval=PAPI_start(thr_vars[tid].EnergyEventSet);
    if (retval != PAPI_OK){
        papi_fail(__FILE__, __LINE__, "PAPI_start() on energy", retval);
    }
  #endif
}
示例#10
0
void 
main(int argc, char ** argv)
{
	double *a, *b, *c, xx=0.01, bw, avg_bw, best_bw=-1.0;
	char * buf1, *buf2, *buf3;
	int i,j,k,offset_a=0,offset_b=0,offset_c=0, mult=1,iter=1000, c_val;
	int len,num_pages, num_lines, cpu_run,scale;
	u64 start, stop, run_time, call_start, call_stop, call_run_time,total_bytes=0;
	__pid_t pid=0;
	int cpu_setsize;
	cpu_set_t mask;
	int *buff;
	size_t buf_size;
	off_t offset = 0;
	int fd = -1;

//	process input arguments

	if(argc < 3 ){
		printf("affinity needs 2 arguments, cpu_run, call count multiplier  def = 1\n");
		printf(" argc = %d\n",argc);
		usage();
		err(1, "bad arguments");
		}


        while ((c_val = getopt(argc, argv, "i:r:l:m:a:b:c")) != -1) {
                switch(c_val) {
                case 'r':
                        cpu_run = atoi(optarg);
                        break;
                case 'm':
                        mult = atoi(optarg);
                        break;
                default:
                        err(1, "unknown option %c", c_val);
                }
        }



// pin core affinity for initialization
        if(pin_cpu(pid, cpu_run) == -1) {
                err(1,"failed to set affinity");
                }
        else{
                fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
                }


// set buffer sizes and loop tripcount
	buf_size = (u64)4096*(u64)num_pages;
	num_lines=64*num_pages;
        iter = iter*mult;

// malloc and initialize buffers

	printf(" starting malloc loop of %d iterations with buf_size = %ld, num_lines = %d\n",iter,buf_size, num_lines);
	call_start = _rdtsc();
	for(i=0;i<iter;i++){
		start = _rdtsc();
	        if(pin_cpu(pid, cpu_run) == -1) {
        	        err(1,"failed to set affinity");
                	}
	        else{
        	        fprintf(stderr," process pinned to core %d for triad run\n",cpu_run);
                	}
		stop = _rdtsc();
		run_time = stop - start;
		}
	call_stop = _rdtsc();
	call_run_time = call_stop - call_start;
//  printout
	printf(" allocating %lld bytes and initializing and freeing took %lld cycles\n",(u64)len*(u64)iter,run_time);
}
示例#11
0
/*
 * Sample a program with PEBS through a system-wide perfmon session on CPU0.
 *
 * argv[1..] is the command line of the program to run; it is handed to
 * child() in a forked process. The parent pins itself to CPU0, creates a
 * perfmon session with a sampling buffer, programs the SMPL_EVENT counter,
 * attaches the session to CPU0 and then loops on poll()/read() handling
 * overflow messages until the global `done` becomes non-zero.
 *
 * Returns 0 on normal completion; every error goes through fatal_error().
 * NOTE(review): fatal_error() is presumably noreturn and `done` is presumably
 * set by the SIGCHLD `handler` -- neither is visible here, confirm.
 */
int
main(int argc, char **argv)
{
	pfmlib_input_param_t inp;
	pfmlib_output_param_t outp;
	pfmlib_core_input_param_t mod_inp;
	pfmlib_options_t pfmlib_options;
	pfarg_pmr_t pc[NUM_PMCS];
	pfarg_pmd_attr_t pd[NUM_PMDS];
	pfarg_sinfo_t sif;
	struct pollfd fds;
	smpl_arg_t buf_arg;
	pfarg_msg_t msg;
	smpl_hdr_t *hdr;
	void *buf_addr;
	uint64_t pebs_size;
	pid_t pid;
	int ret, fd, type;
	unsigned int i;
	uint32_t ctx_flags;

	if (argc < 2)
		fatal_error("you need to pass a program to sample\n");

	if (pfm_initialize() != PFMLIB_SUCCESS)
		fatal_error("libpfm intialization failed\n");

	/*
	 * check we are on an Intel Core PMU (the Atom PMU is accepted as well)
	 */
	pfm_get_pmu_type(&type);
	if (type != PFMLIB_INTEL_CORE_PMU && type != PFMLIB_INTEL_ATOM_PMU)
		fatal_error("This program only works with an Intel Core processor\n");

	/*
	 * pass options to library (optional)
	 */
	memset(&pfmlib_options, 0, sizeof(pfmlib_options));
	pfmlib_options.pfm_debug   = 0; /* set to 1 for debug */
	pfmlib_options.pfm_verbose = 1; /* set to 1 for verbose */
	pfm_set_options(&pfmlib_options);

	/*
	 * start from zeroed argument structures
	 */
	memset(pd, 0, sizeof(pd));
	memset(pc, 0, sizeof(pc));
	memset(&inp, 0, sizeof(inp));
	memset(&outp, 0, sizeof(outp));
	memset(&mod_inp, 0, sizeof(mod_inp));
	memset(&sif, 0, sizeof(sif));

	memset(&buf_arg, 0, sizeof(buf_arg));

	memset(&fds, 0, sizeof(fds));

	/*
	 * search for our sampling event
	 */
	if (pfm_find_full_event(SMPL_EVENT, &inp.pfp_events[0]) != PFMLIB_SUCCESS)
		fatal_error("cannot find sampling event %s\n", SMPL_EVENT);

	inp.pfp_event_count = 1;
	inp.pfp_dfl_plm = PFM_PLM3;

	/*
	 * important: inform libpfm we do use PEBS
	 */
	mod_inp.pfp_core_pebs.pebs_used = 1;

	/*
	 * sampling buffer parameters: the buffer is three pages long
	 */
	pebs_size = 3 * getpagesize();
	buf_arg.buf_size = pebs_size;

	/*
	 * sampling period cannot use more bits than HW counter can support
	 */
	buf_arg.cnt_reset = -SMPL_PERIOD;

	/*
	 * We want a system-wide context for sampling
	 */
	ctx_flags = PFM_FL_SYSTEM_WIDE | PFM_FL_SMPL_FMT;

	/*
	 * trigger notification (interrupt) once 90% of the entries that fit
	 * in the buffer have been filled
	 */
	buf_arg.intr_thres = (pebs_size/sizeof(smpl_entry_t))*90/100;

	/*
	 * we want to measure CPU0, thus we pin ourself to the CPU before invoking
	 * perfmon. This ensures that the sampling buffer will be allocated on the
	 * same NUMA node.
	 */
	ret = pin_cpu(getpid(), 0);
	if (ret)
		fatal_error("cannot pin on CPU0");

	/*
	 * create session and sampling buffer
	 */
	fd = pfm_create(ctx_flags, &sif, FMT_NAME, &buf_arg, sizeof(buf_arg));
	if (fd == -1) {
		if (errno == ENOSYS) {
			fatal_error("Your kernel does not have performance monitoring support!\n");
		}
		fatal_error("cannot create session %s, maybe you do not have the PEBS sampling format in the kernel.\nCheck /sys/kernel/perfmon/formats\n", strerror(errno));
	}

	/*
	 * map buffer into our address space
	 *
	 * NOTE(review): buf_addr is printed before it is compared against
	 * MAP_FAILED, so a failed mmap() still produces the "mapped" line.
	 */
	buf_addr = mmap(NULL, (size_t)buf_arg.buf_size, PROT_READ, MAP_PRIVATE, fd, 0);
	printf("session [%d] buffer mapped @%p\n", fd, buf_addr);
	if (buf_addr == MAP_FAILED)
		fatal_error("cannot mmap sampling buffer errno %d\n", errno);

	/* the mapping is accessed through a smpl_hdr_t header */
	hdr = (smpl_hdr_t *)buf_addr;

	printf("pebs_base=0x%llx pebs_end=0x%llx index=0x%llx\n"
	       "intr=0x%llx version=%u.%u\n"
	       "entry_size=%zu ds_size=%zu\n",
			(unsigned long long)hdr->ds.pebs_buf_base,
			(unsigned long long)hdr->ds.pebs_abs_max,
			(unsigned long long)hdr->ds.pebs_index,
			(unsigned long long)hdr->ds.pebs_intr_thres,
			PFM_VERSION_MAJOR(hdr->version),
			PFM_VERSION_MINOR(hdr->version),
			sizeof(smpl_entry_t),
			sizeof(hdr->ds));

	if (PFM_VERSION_MAJOR(hdr->version) < 1)
		fatal_error("invalid buffer format version\n");

	/*
	 * get which PMC registers are available
	 */
	detect_unavail_pmu_regs(&sif, &inp.pfp_unavail_pmcs, NULL);

	/*
	 * let libpfm figure out how to assign event onto PMU registers
	 */
	if (pfm_dispatch_events(&inp, &mod_inp, &outp, NULL) != PFMLIB_SUCCESS)
		fatal_error("cannot assign event %s\n", SMPL_EVENT);


	/*
	 * propagate PMC setup from libpfm to perfmon
	 */
	for (i=0; i < outp.pfp_pmc_count; i++) {
		pc[i].reg_num   = outp.pfp_pmcs[i].reg_num;
		pc[i].reg_value = outp.pfp_pmcs[i].reg_value;

		/*
		 * must disable 64-bit emulation on the PMC0 counter.
		 * PMC0 is the only counter useable with PEBS. We must disable
		 * 64-bit emulation to avoid getting interrupts for each
		 * sampling period, PEBS takes care of this part.
		 */
		if (pc[i].reg_num == 0)
			pc[i].reg_flags = PFM_REGFL_NO_EMUL64;
	}

	/*
	 * propagate PMD set from libpfm to perfmon
	 */
	for (i=0; i < outp.pfp_pmd_count; i++)
		pd[i].reg_num = outp.pfp_pmds[i].reg_num;

	/*
	 * setup sampling period for first counter
	 * we want notification on overflow, i.e., when buffer is full
	 */
	pd[0].reg_flags = PFM_REGFL_OVFL_NOTIFY;
	pd[0].reg_value = -SMPL_PERIOD;

	/* the same period is used for both the long and the short reset value */
	pd[0].reg_long_reset = -SMPL_PERIOD;
	pd[0].reg_short_reset = -SMPL_PERIOD;

	/*
	 * Now program the registers
	 */
	if (pfm_write(fd, 0, PFM_RW_PMC, pc, outp.pfp_pmc_count * sizeof(*pc)) == -1)
		fatal_error("pfm_write error errno %d\n",errno);

	if (pfm_write(fd, 0, PFM_RW_PMD_ATTR, pd, outp.pfp_pmd_count * sizeof(*pd)) == -1)
		fatal_error("pfm_write(PMD) error errno %d\n",errno);

	/*
	 *  attach the session to CPU0
	 */
	if (pfm_attach(fd, 0, 0) == -1)
		fatal_error("pfm_attach error errno %d\n",errno);

	/*
	 * Create the child task
	 */
	signal(SIGCHLD, handler);

	if ((pid=fork()) == -1)
		fatal_error("Cannot fork process\n");

	if (pid == 0) {
		/* child does not inherit context file descriptor */
		close(fd);

		/*
		 * if child is too short-lived we may not measure it
		 *
		 * NOTE(review): presumably child() never returns (exec or exit);
		 * if it did, the child would continue into the monitoring code
		 * below with a closed fd -- confirm.
		 */
		child(argv+1);
	}

	/*
	 * start monitoring
	 */
	if (pfm_set_state(fd, 0, PFM_ST_START) == -1)
		fatal_error("pfm_set_state(start) error errno %d\n",errno);

	fds.fd = fd;
	fds.events = POLLIN;
	/*
	 * core loop
	 */
	for(;done == 0;) {
		/*
		 * Must use a timeout to avoid a race condition
		 * with the SIGCHLD signal
		 */
		ret = poll(&fds, 1, 500);

		/*
		 * if timeout expired, then check done
		 */
		if (ret == 0)
			continue;

		if (ret == -1) {
			/*
			 * NOTE(review): the inner ret == -1 test is redundant here, and
			 * the warning text says "read" although the interrupted call
			 * is poll().
			 */
			if(ret == -1 && errno == EINTR) {
				warning("read interrupted, retrying\n");
				continue;
			}
			fatal_error("poll failed: %s\n", strerror(errno));
		}

		/*
		 * NOTE(review): only -1 is treated as an error; a short read would
		 * leave msg partially filled before msg.type is examined.
		 */
		ret = read(fd, &msg, sizeof(msg));
		if (ret == -1)
			fatal_error("cannot read perfmon msg: %s\n", strerror(errno));

		switch(msg.type) {
			case PFM_MSG_OVFL: /* the sampling buffer is full */
				process_smpl_buf(hdr);
				/*
				 * reactivate monitoring once we are done with the samples
				 * in system-wide, interface guarantees monitoring is active
				 * upon return from the pfm_restart() syscall
				 */
				if (pfm_set_state(fd, 0, PFM_ST_RESTART) == -1)
					fatal_error("pfm_set_state(restart) error errno %d\n",errno);
				break;
			default: fatal_error("unknown message type %d\n", msg.type);
		}
	}
	/*
	 * cleanup child
	 */
	waitpid(pid, NULL, 0);

	/*
	 * stop monitoring, this is required in order to guarantee that the PEBS buffer
	 * header is updated with the latest position, such that we see the final
	 * samples
	 */
	if (pfm_set_state(fd, 0, PFM_ST_STOP) == -1)
		fatal_error("pfm_set_state(stop) error errno %d\n",errno);

	/*
	 * check for any leftover samples. Monitoring must be stopped for the
	 * buffer contents to be guaranteed up to date
	 */
	process_smpl_buf(hdr);

	/*
	 * close session
	 */
	close(fd);

	/*
	 * unmap sampling buffer and actually free the perfmon session
	 */
	munmap(buf_addr, (size_t)buf_arg.buf_size);

	return 0;
}