Example #1
0
/*
 * Trace-driven branch predictor driver.
 *
 * Usage: <program> [trace]
 *
 * Reads a branch trace, asks the predictor for a prediction on every branch,
 * trains it with the actual outcome, and prints the misprediction statistics.
 *
 * When exactly one argument is given it is handed to setup_trace(); otherwise
 * setup_trace(NULL) is called (presumably selecting a default input such as
 * stdin -- confirm against setup_trace()).
 *
 * Returns 0 on success, 1 if the trace header cannot be read.
 */
int main (int argc, char * argv[])
{
  long mis_preds = 0;
  long num_branches = 0;
  uint32_t pc = 0;
  bool outcome = false;

  // Initialize the predictor
  init_predictor ();

  if (argc == 2)
    setup_trace (argv[1]);
  else
    setup_trace (NULL);

  // Read the number of instructions from the trace.  The value is not used
  // below, but the header word must be consumed before the branch records.
  uint32_t stat_num_insts = 0;
  if (fread (&stat_num_insts, sizeof (uint32_t), 1, stream) != 1) {
    printf ("Could not read intput file\n");
    return 1;
  }
  // The trace stores 32-bit words in network byte order.
  stat_num_insts = ntohl (stat_num_insts);
  (void) stat_num_insts;

  // Read each branch from the trace
  while (read_branch (&pc, &outcome)) {

    pc = ntohl (pc);

    num_branches ++;

    // Make a prediction and compare with actual outcome
    if (make_prediction (pc) != outcome)
      mis_preds ++;

    // Train the predictor
    train_predictor (pc, outcome);
  }

  // Print out the mispredict statistics.  The counters are long, so they
  // must be printed with %ld (passing a long to %d is undefined behaviour).
  printf ("Branches\t\t%10ld\n", num_branches);
  printf ("Incorrect\t\t%10ld\n", mis_preds);

  // An empty trace would otherwise compute 0/0; report a rate of 0 instead.
  float mis_pred_rate = 0.0f;
  if (num_branches > 0)
    mis_pred_rate = 100*(float)mis_preds / (float)num_branches;
  printf ("100*wrong_predicts/total branches is %8ld / %8ld = %7.3f\n", mis_preds, num_branches, mis_pred_rate);

  if (argc == 2)
    close_trace ();

  return 0;
}
/*
 * Tiled merge sort of a[0..N), using a separately allocated auxiliary buffer.
 *
 * Inputs of at most 2048 elements are handled by one presort_flexible() +
 * merge() pass.  Larger inputs are processed in LIMIT-element tiles: each
 * tile is presorted and merged on its own, alternating merge() and
 * merge_reverse(); a tail shorter than LIMIT is handled in 1024-element
 * pieces plus a final partial piece; one last merge() call then combines the
 * tiles.
 *
 * The auxiliary buffer is positioned so that its cache index is the index of
 * `a` with the top index bit flipped (see the comment in the body).
 *
 * Side effects: labels global_predictor[] entries via describe_predictor(),
 * changes the presort count via set_presort_count(), and re-initialises
 * predictors 0..13 before returning.  Calls abort() if the auxiliary buffer
 * cannot be allocated.
 */
void
tiled_mergesort(unsigned int a[], int N)
{
	int i,j; /* loop counters for the level 2 tiles and the leftover level 1 pieces */
	int p;   /* predictor index for the reset loop at the end */

	int level2_count;
	int extra_level1_count = 0;
	int extra_level2;
	int final_extra;


	unsigned int* aux_data; /* what memalign() returned; this is what gets freed */
	unsigned int* aux;      /* index-adjusted working pointer inside aux_data */

	unsigned int* level2_start;
	unsigned int* level2_aux_start;

	unsigned int minusA;

	unsigned int** level1_finish;
	unsigned int** level1_other;

	int odd = 0;

	describe_predictor(&global_predictor[0], "forwards middle");
	describe_predictor(&global_predictor[1], "forwards next");
	describe_predictor(&global_predictor[2], "forwards end");
	describe_predictor(&global_predictor[3], "forwards setup");
	describe_predictor(&global_predictor[4], "forwards equal");
	describe_predictor(&global_predictor[5], "reverse middle");
	describe_predictor(&global_predictor[6], "reverse next");
	describe_predictor(&global_predictor[7], "reverse end");
	describe_predictor(&global_predictor[8], "reverse setup");
	describe_predictor(&global_predictor[9], "reverse equal");
	describe_predictor(&global_predictor[10], "insertion outer");
	describe_predictor(&global_predictor[11], "insertion inner");
	describe_predictor(&global_predictor[12], "insertion reverse outer");
	describe_predictor(&global_predictor[13], "insertion reverse inner");
#ifdef _USE_ROLLED_LOOPS

	describe_predictor(&global_predictor[14], "forwards left");
	describe_predictor(&global_predictor[15], "forwards right");
	describe_predictor(&global_predictor[16], "reverse left");
	describe_predictor(&global_predictor[17], "reverse right");
#else
	describe_predictor(&global_predictor[14], "forwards left 0");
	describe_predictor(&global_predictor[15], "forwards left 1");
	describe_predictor(&global_predictor[16], "forwards left 2");
	describe_predictor(&global_predictor[17], "forwards left 3");
	describe_predictor(&global_predictor[18], "forwards left 4");
	describe_predictor(&global_predictor[19], "forwards left 5");
	describe_predictor(&global_predictor[20], "forwards left 6");
	describe_predictor(&global_predictor[21], "forwards left 7");
	describe_predictor(&global_predictor[22], "forwards right 0");
	describe_predictor(&global_predictor[23], "forwards right 1");
	describe_predictor(&global_predictor[24], "forwards right 2");
	describe_predictor(&global_predictor[25], "forwards right 3");
	describe_predictor(&global_predictor[26], "forwards right 4");
	describe_predictor(&global_predictor[27], "forwards right 5");
	describe_predictor(&global_predictor[28], "forwards right 6");
	describe_predictor(&global_predictor[29], "forwards right 7");

	describe_predictor(&global_predictor[30], "reverse left 0");
	describe_predictor(&global_predictor[31], "reverse left 1");
	describe_predictor(&global_predictor[32], "reverse left 2");
	describe_predictor(&global_predictor[33], "reverse left 3");
	describe_predictor(&global_predictor[34], "reverse left 4");
	describe_predictor(&global_predictor[35], "reverse left 5");
	describe_predictor(&global_predictor[36], "reverse left 6");
	describe_predictor(&global_predictor[37], "reverse left 7");
	describe_predictor(&global_predictor[38], "reverse right 0");
	describe_predictor(&global_predictor[39], "reverse right 1");
	describe_predictor(&global_predictor[40], "reverse right 2");
	describe_predictor(&global_predictor[41], "reverse right 3");
	describe_predictor(&global_predictor[42], "reverse right 4");
	describe_predictor(&global_predictor[43], "reverse right 5");
	describe_predictor(&global_predictor[44], "reverse right 6");
	describe_predictor(&global_predictor[45], "reverse right 7");
#endif

	/* A quick explanation of the aux placement:
	 * an address is split into 3 parts: the tag, the index and the offset.
	 * Suppose there's a 32 bit address, a 32 byte cache line and 65536 cache
	 * blocks, as in our tests. In this case, the 32 bit address is split into
	 * a 5 bit offset (2^5 = 32 byte cache line), a 16 bit index (2^16 = 65536
	 * cache blocks) and the rest is the tag.  Therefore, a and aux need to
	 * have exactly the opposite index. minusA is the index aux needs to have,
	 * which we mask in. If this results in an address lower than the one we
	 * started with (in aux_data), then that's out of bounds, and we increase
	 * it by one full index range (1 << BLOCK_AND_LINE_BITS bytes).
	 */

	/* get the index we need */
	minusA = get_index(a) ^ (1 << (BLOCK_BITS - 1));

	/* the 2*LIMIT extra elements are slack for moving aux forward inside the
	 * allocation (presumably sized to cover the worst-case shift -- confirm
	 * against LIMIT and BLOCK_AND_LINE_BITS) */
	aux_data = memalign(ALIGNMENT, (N + 2*LIMIT) * sizeof(unsigned int));
	if (aux_data == NULL)
	{
		/* no error channel (void return); continuing would hand the merges
		 * an invalid scratch pointer */
		abort();
	}

	/* Clear the index bits, and mask in the desired index.  The arithmetic is
	 * done in size_t rather than unsigned int so that the upper address bits
	 * survive on targets where pointers are wider than unsigned int. */
	aux = (unsigned int*)(((size_t)aux_data & ~(size_t)(BLOCK_AND_LINE_MASK)) | ((size_t)minusA << LINE_BITS));
	if (aux < aux_data) /* then the new index is less than the old one */
	{
		aux = (unsigned int*)((size_t)aux + ((size_t)1 << (BLOCK_AND_LINE_BITS)));
	}



	if (N <= 2048) /* fits in the level 1 cache */
	{
		if (get_count(N) & 1) set_presort_count(ODD_COUNT);
		else set_presort_count(EVEN_COUNT);

		presort_flexible(a, N);
		merge(a, N, presort_count, aux);
		goto end;
	}

	level2_count = N / LIMIT; /* the number of standard LIMIT sized passes */
	extra_level2 = N % LIMIT; /* the number of extra items left, after the level 2 passes */
	extra_level1_count = extra_level2 / 1024; /* number of extra level 1 passes */
	final_extra = extra_level2 % 1024; /* number of items left over */

	level2_start = a;
	level2_aux_start = aux;

	/* make sure it ends up in a, not aux */
	/* odd means it should end up in aux, and the final merge will get it into a */
	/* even means it should end up in a, and the final merge will do an even number of steps */

	/* Obviously, we should take this out. But the final bit is quite complex,
	 * it doesn't come up in the tests (due to using powers of 2) and the extra
	 * code doesn't cause too much of a hit. More importantly, it doesn't need
	 * to be optimal, as we're only interested in data cache and branch
	 * predictors, so the fact that there's a few extra instructions to get
	 * loaded doesn't make a difference. */
	if (get_count(N) & 1)
	{
		level1_finish =  &level2_aux_start;
		level1_other = &level2_start;
		set_presort_count(ODD_COUNT);
		odd = 1;
	}
	else
	{
		level1_finish =  &level2_start;
		level1_other = &level2_aux_start;
		set_presort_count(EVEN_COUNT);
		odd = 0;
	}

	/* sort it level 2: tiles are taken in pairs, the first merged forwards
	 * and the second in reverse */
	for(i = 0; i < level2_count-1; i+=2)
	{
		/* merge them all into LIMIT sized bits */
		presort(level2_start, LIMIT);
		merge(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;


		/* now do it in reverse */
		presort(level2_start, LIMIT);
		merge_reverse(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;

	}
	if (i < level2_count) /* an odd tile out: it goes forwards */
	{
		/* merge them all into LIMIT sized bits */
		presort(level2_start, LIMIT);
		merge(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;
	}


	/* this bit is too complicated to remove the 2 level tiling from. It
	 * doesn't alter the results either, since we use powers of two */

	/* sort the remaining bits */
	/* level2_start is in the right place */
	if (extra_level2) /* there is a maximum of 1 extra level 2 sort */
	{
		int extra_level1_single = extra_level1_count & 0x1;
		unsigned int* level1_start = level2_start;
		unsigned int* level1_aux_start = level2_aux_start;
		extra_level1_count &= ~0x1; /* clear the last bit */

		for(j = 0; j < extra_level1_count; j+=2) /* merge the level 1 cache first */
		{
			presort(level1_start, 1024);
			merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */
			level1_start += 1024;
			level1_aux_start += 1024;

			/* now reverse it */

			presort(level1_start, 1024);
			merge_reverse(level1_start, 1024, presort_count, level1_aux_start);
			level1_start += 1024;
			level1_aux_start += 1024;
			/* these end up in aux */
		}

		if (extra_level1_single) /* if there is a full one left, it's forward */
		{
			presort(level1_start, 1024);
			merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */
			level1_start += 1024;
			level1_aux_start += 1024;
		}

		if (final_extra) /* there's less than a full level1 sized chunk */
		{
			/* these will be sorted in one go */
			/* if it turns out the number should be 2049, I may need to change this */
			if (final_extra <= double_presort_count)
			{
				if (!odd) set_presort_count(ODD_COUNT);
			}
			else
			{
				if (get_count(final_extra) & 1)
				{
					if (odd) set_presort_count(EVEN_COUNT);
					else set_presort_count(ODD_COUNT);
				}
			}

			/* forwards unless a single full piece was just merged forwards */
			if (!extra_level1_single)
			{
				presort_flexible(level1_start, final_extra);
				merge(level1_start, final_extra, presort_count, level1_aux_start);
			}
			else
			{
				presort_flexible_reverse(level1_start, final_extra);
				merge_reverse(level1_start, final_extra, presort_count, level1_aux_start);
			}
		}

		/* merge the whole extra into 1 */
		merge(*level1_finish, extra_level2, 1024, *level1_other);
	}

	/* now merge everything together */
	if (N > LIMIT)
	{
		if (odd) /* it's in aux - we took steps to ensure it */
		{
			merge(aux, N, LIMIT, a);
		}
		else
		{
			merge(a, N, LIMIT, aux);
		}
	}

end:

	/* free the pointer memalign() returned, not the adjusted aux */
	free(aux_data);

	/* reset predictors 0..13 */
	for (p = 0; p <= 13; p++)
	{
		init_predictor(&global_predictor[p]);
	}
}
/*
 * Merge sort of a[0..N) written as a goto-driven state machine; the labels
 * N2..N13 name the steps of the algorithm the code was transcribed from (the
 * original comments point to a flow graph "on page 162").
 *
 * Each pass reads runs from both ends of `source` (i moving up, j moving
 * down) and writes merged runs alternately to the two ends of `target`
 * (k/l, direction d).  `s` selects which of a/aux is the source for the
 * current pass; passes repeat until one completes without switching sides
 * (f stays 1).  If the final result was written into aux it is copied back
 * into a.
 *
 * Every data-dependent branch is reported to global_predictor[0..7] through
 * branch_taken()/branch_not_taken(); predictors 0, 4, 6 and 7 are cleared
 * again before returning.
 *
 * N <= 0 is treated as "nothing to sort".  Calls abort() if the scratch
 * buffer cannot be allocated.
 */
void
algorithm_n(unsigned int a[], int N)
{
	unsigned int* aux = NULL; /* scratch area holding N elements */

	int s = 0; /* this picks which area we write to */

	int i,j; /* indices for the first array */
	int k, l; /* indices for second array */

	int d, f; /* d => direction, if (f == 0) keep going */

	int temp; /* swap temporary for k and l */

	unsigned int* source;
	unsigned int* target;

	describe_predictor(&global_predictor[0], "N2");
	describe_predictor(&global_predictor[1], "N3");
	describe_predictor(&global_predictor[2], "N3 i == j");
	describe_predictor(&global_predictor[3], "N5");
	describe_predictor(&global_predictor[4], "N7");
	describe_predictor(&global_predictor[5], "N9");
	describe_predictor(&global_predictor[6], "N11");
	describe_predictor(&global_predictor[7], "N13");

	/* With no elements, j = N-1 would be -1 and step N3 would read
	 * source[-1]; there is nothing to sort, so go straight to cleanup. */
	if (N <= 0) goto cleanup;

	aux = malloc(N * sizeof(unsigned int));
	if (aux == NULL)
	{
		/* no error channel (void return) and no way to sort without it */
		abort();
	}

N2:	/* Prepare for pass */

	i = 0;
	j = N-1;
	k = 0;
	l = N-1;
	d = 1;
	f = 1;

	if (s == 0)
	{
		branch_taken(&global_predictor[0]);
		source = a;
		target = aux;
	}
	else
	{
		branch_not_taken(&global_predictor[0]);
		source = aux;
		target = a;
	}

N3: /* compare Ki, Kj */

	if (source[i] > source[j])
	{
		branch_taken(&global_predictor[1]);
		goto N8;
	}
	else branch_not_taken(&global_predictor[1]);

	if (i == j)
	{
		/* the two read positions met: emit the last element, pass is over */
		branch_taken(&global_predictor[2]);
		target[k] = source[i];
		goto N13;
	}
	else branch_not_taken(&global_predictor[2]);

/* N4: transmit Ri */

	target[k] = source[i];
	k = k + d; /* increment in the correct direction */

/* N5: stepdown? */

	i++;
	if (source[i-1] <= source[i])
	{
		branch_taken(&global_predictor[3]);
		goto N3;
	}
	else branch_not_taken(&global_predictor[3]);

N6: /* the left run ended: transmit Rj until the right run ends too */
	target[k] = source[j];
	k = k + d;

/* N7: stepdown? */

	j--;
	if (source[j+1] <= source[j])
	{
		branch_taken(&global_predictor[4]);
		goto N6;
	}
	else
	{
		branch_not_taken(&global_predictor[4]);
		goto N12;
	}

N8: /* transmit Rj */

	target[k]  = source[j];
	k = k + d; /* increment in the correct direction */

/* N9: stepdown? */

	j--;
	if (source[j+1] <= source[j])
	{
		branch_taken(&global_predictor[5]);
		goto N3;
	}
	else branch_not_taken(&global_predictor[5]);

N10: /* the right run ended: transmit Ri until the left run ends too */


	target[k] = source[i];
	k = k + d;

/* N11: stepdown? */

	i++;
	if (source[i-1] <= source[i])
	{
		branch_taken(&global_predictor[6]);
		goto N10;
	}
	else branch_not_taken(&global_predictor[6]);

N12: /* switch sides (of the flow graph on page 162) */

	f = 0; /* more than one run was produced, so another pass is needed */
	d = -d; /* change the direction */
	temp = k;
	k = l;
	l = temp;

	goto N3;


N13: /* switch areas */

	if (f == 0)
	{
		branch_taken(&global_predictor[7]);
		s = 1 - s; /* s = !s */
		goto N2;
	}

	/* sorting is complete */
	branch_not_taken(&global_predictor[7]);
	if (s == 0)
	{
		/* the last pass wrote into aux; bring the result back into a */
		memcpy(a, target, N * sizeof(unsigned int));
	}

cleanup:

	free(aux);

	/* clear uninteresting predictors */
	init_predictor(&global_predictor[0]);
	init_predictor(&global_predictor[4]);
	init_predictor(&global_predictor[6]);
	init_predictor(&global_predictor[7]);

}
/*
 * Quicksort of a[0..N) with an explicit stack and "pseudo median of 7"
 * pivot selection, finished off by insertion sort.
 *
 * Ranges whose index difference r - l is at most THRESHHOLD are not
 * partitioned.  After the partitioning loop, insertion_sentinel() is run
 * over the first min(N, 2*THRESHHOLD) elements and insertion() over the
 * whole array.
 *
 * Side effects: labels global_predictor[0..16]; on exit the counts of
 * predictors 5..16 are added into predictor 4 ("median") and predictors
 * 5..16 are re-initialised.
 */
void
base_quicksort7(unsigned int a[], int N)
{
    int l, r;
    int i;
    int m;
    int il, ir; /* names follow pl, pm, and pn from bently/mcilroy. used ir instead of in */
    int p;      /* predictor index for the aggregation loops */

    stackinit(N);

    describe_predictor(&global_predictor[0], "i");
    describe_predictor(&global_predictor[1], "j");
    describe_predictor(&global_predictor[2], "partition end");
    describe_predictor(&global_predictor[3], "insertion");
    describe_predictor(&global_predictor[4], "median");
    describe_predictor(&global_predictor[5], "median of 7 bc");
    describe_predictor(&global_predictor[6], "median of 7 ac");
    describe_predictor(&global_predictor[7], "median of 7 cb");
    describe_predictor(&global_predictor[8], "median of 7 ca");
    describe_predictor(&global_predictor[9], "median of 7 ab2");
    describe_predictor(&global_predictor[10], "median of 7 bc2");
    describe_predictor(&global_predictor[11], "median of 7 ac2");
    describe_predictor(&global_predictor[12], "median of 7 cb2");
    describe_predictor(&global_predictor[13], "median of 7 ca2");
    describe_predictor(&global_predictor[14], "median of 3 cmp1");
    describe_predictor(&global_predictor[15], "median of 3 cmp2");
    describe_predictor(&global_predictor[16], "median of 3 cmp3");


    r = N-1;
    l = 0;

    while(1)
    {
        int n = r - l;

        if (n <= THRESHHOLD)
        {
            /* small range: leave it for the insertion sort and fetch the
             * next pending range, if any */
            if (stackempty())
                break;

            l = pop();
            r = pop();
            continue;
        }

        /* pseudo - Median of 7 partitioning */

        /* Midpoint computed as l + n/2 instead of (l+r)/2: same value for
         * 0 <= l <= r, but the sum l+r cannot overflow int. */
        m = l + n/2;
        if (n > 40)
        {
            int n6 = n/6;
            int n3 = n/3;

            il = med3(a, l, l + n6, l + n3);

            /* the 2 is for separate branch predictors, as it's inlined */
            ir = med3_2(a, r - n3, r - n6, r);

            exch(a[l], a[il]);
            exch(a[r], a[ir]);
            exch(a[m], a[r-1]);
        }


        /* order a[l], a[r-1], a[r]; the last argument is the predictor
         * number (14..16, the "median of 3 cmp" predictors) */
        pred_compexch(a[l], a[r-1], 14);
        pred_compexch(a[l], a[r], 15);
        pred_compexch(a[r-1], a[r], 16);

        i = partition(a,l+1,r-1);

        /* NOTE(review): legacy note kept from the original source, apparently
         * about partition(): "the key is being copied more times than
         * necessary, because it is not removed when it is taken as the key
         * and is instead put in place more than once (example: i == 1,
         * j == 10; key = a[1]; key < pivot, so key is swapped with a[2], the
         * key is now in a[1] and a[2])".  Cannot be verified from here. */

        /* push the larger side, continue with the smaller one */
        if (i-l > r-i)
        {
            push(i-1,l);
            l = i+1;
        }
        else
        {
            push(r,i+1);
            r = i-1;
        }
    }

    stackclear();

    /* THRESHHOLD is compared against r - l above, i.e. an index difference
     * rather than an element count.  The sentinel pass covers the first
     * 2*THRESHHOLD elements, capped at N. */
    if (2*THRESHHOLD > N) insertion_sentinel(a,N);
    else insertion_sentinel(a,2*THRESHHOLD);

    insertion(a, N);

    /* add the predictors up: fold 5..16 into predictor 4, then reset them */
    for (p = 5; p <= 16; p++)
    {
        add_predictor(&global_predictor[4], &global_predictor[p]);
    }
    for (p = 5; p <= 16; p++)
    {
        init_predictor(&global_predictor[p]);
    }
}