int main (int argc, char * argv[]) { long mis_preds = 0; long num_branches = 0; uint32_t pc = 0; bool outcome = false; // Initialize the predictor init_predictor (); if (argc == 2) setup_trace (argv[1]); else setup_trace (NULL); // Read the number of instructions from the trace uint32_t stat_num_insts = 0; if (fread (&stat_num_insts, sizeof (uint32_t), 1, stream) != 1) { printf ("Could not read intput file\n"); return 1; } stat_num_insts = ntohl (stat_num_insts); // Read each branch from the trace while (read_branch (&pc, &outcome)) { pc = ntohl (pc); num_branches ++; // Make a prediction and compare with actual outcome if (make_prediction (pc) != outcome) mis_preds ++; // Train the predictor train_predictor (pc, outcome); } // Print out the mispredict statistics printf ("Branches\t\t%10d\n", num_branches); printf ("Incorrect\t\t%10d\n", mis_preds); float mis_pred_rate = 100*(float)mis_preds / float(num_branches); printf ("100*wrong_predicts/total branches is %8d / %8d = %7.3f\n", mis_preds, num_branches, mis_pred_rate); if (argc == 2) close_trace (); return 0; }
void tiled_mergesort(unsigned int a[], int N) { /* track keeps an eye on where i started last */ /* outer track keeps and eye on which cache_sized/2 segemnt i started in last */ int i,j; /* indices for the first array */ int level1_count; int level2_count; int extra_level1_count = 0; int extra_level2; int final_extra; unsigned int* aux_data; unsigned int* aux; unsigned int* level2_start; unsigned int* level2_aux_start; unsigned int minusA; unsigned int** level1_finish; unsigned int** level1_other; int odd = 0; describe_predictor(&global_predictor[0], "forwards middle"); describe_predictor(&global_predictor[1], "forwards next"); describe_predictor(&global_predictor[2], "forwards end"); describe_predictor(&global_predictor[3], "forwards setup"); describe_predictor(&global_predictor[4], "forwards equal"); describe_predictor(&global_predictor[5], "reverse middle"); describe_predictor(&global_predictor[6], "reverse next"); describe_predictor(&global_predictor[7], "reverse end"); describe_predictor(&global_predictor[8], "reverse setup"); describe_predictor(&global_predictor[9], "reverse equal"); describe_predictor(&global_predictor[10], "insertion outer"); describe_predictor(&global_predictor[11], "insertion inner"); describe_predictor(&global_predictor[12], "insertion reverse outer"); describe_predictor(&global_predictor[13], "insertion reverse inner"); #ifdef _USE_ROLLED_LOOPS describe_predictor(&global_predictor[14], "forwards left"); describe_predictor(&global_predictor[15], "forwards right"); describe_predictor(&global_predictor[16], "reverse left"); describe_predictor(&global_predictor[17], "reverse right"); #else describe_predictor(&global_predictor[14], "forwards left 0"); describe_predictor(&global_predictor[15], "forwards left 1"); describe_predictor(&global_predictor[16], "forwards left 2"); describe_predictor(&global_predictor[17], "forwards left 3"); describe_predictor(&global_predictor[18], "forwards left 4"); describe_predictor(&global_predictor[19], 
"forwards left 5"); describe_predictor(&global_predictor[20], "forwards left 6"); describe_predictor(&global_predictor[21], "forwards left 7"); describe_predictor(&global_predictor[22], "forwards right 0"); describe_predictor(&global_predictor[23], "forwards right 1"); describe_predictor(&global_predictor[24], "forwards right 2"); describe_predictor(&global_predictor[25], "forwards right 3"); describe_predictor(&global_predictor[26], "forwards right 4"); describe_predictor(&global_predictor[27], "forwards right 5"); describe_predictor(&global_predictor[28], "forwards right 6"); describe_predictor(&global_predictor[29], "forwards right 7"); describe_predictor(&global_predictor[30], "reverse left 0"); describe_predictor(&global_predictor[31], "reverse left 1"); describe_predictor(&global_predictor[32], "reverse left 2"); describe_predictor(&global_predictor[33], "reverse left 3"); describe_predictor(&global_predictor[34], "reverse left 4"); describe_predictor(&global_predictor[35], "reverse left 5"); describe_predictor(&global_predictor[36], "reverse left 6"); describe_predictor(&global_predictor[37], "reverse left 7"); describe_predictor(&global_predictor[38], "reverse right 0"); describe_predictor(&global_predictor[39], "reverse right 1"); describe_predictor(&global_predictor[40], "reverse right 2"); describe_predictor(&global_predictor[41], "reverse right 3"); describe_predictor(&global_predictor[42], "reverse right 4"); describe_predictor(&global_predictor[43], "reverse right 5"); describe_predictor(&global_predictor[44], "reverse right 6"); describe_predictor(&global_predictor[45], "reverse right 7"); #endif /* a quick explanation, cause I keep needing to be reminded how I did this: * an address is split into 3 parts: the tag, the index and the offset. * Suppose theres a 32 bit address, a 32 byte cache line and 65536 cache * blocks, as in our tests. 
In this case, the 32 bit address is split into * a 5 bit offset (2^5 = 32 byte cache line), a 16 bit index (2^16 = 65536 * cache blocks) and the rest is the tag. Therefore, a and aux need to * have exactly the opposite index. minusA is the index aux needs to have, * which we mask in. If this results in an address lower than the one we * started with (in aux), then thats out of bounds, and increase it by * 65536. */ /* get the index we need*/ minusA = get_index(a) ^ (1 << (BLOCK_BITS - 1)); aux_data = memalign(ALIGNMENT, (N + 2*LIMIT) * sizeof(unsigned int)); /* clear the index bits, and mask in the desired index */ aux = (unsigned int*)(((unsigned int)aux_data & (~BLOCK_AND_LINE_MASK)) | (minusA << LINE_BITS)); if (aux < aux_data) /* then the new index is less than the old one */ { aux = (unsigned int*)((unsigned int)aux + (1 << (BLOCK_AND_LINE_BITS))); } if (N <= 2048) /* fits in the level 1 cache */ { if (get_count(N) & 1) set_presort_count(ODD_COUNT); else set_presort_count(EVEN_COUNT); presort_flexible(a, N); merge(a, N, presort_count, aux); goto end; } /* OUT(get_count(N)); */ /* OUT(N); */ level2_count = N / LIMIT; /* the number of standard LIMIT sized passes */ level1_count = LIMIT / 1024; /* the number of standard 4k sized passes, per level 2 iteration */ extra_level2 = N % LIMIT; /* the number of extra items left, after the level 2 passes*/ extra_level1_count = extra_level2 / 1024; /* number of extra level 1 passes */ final_extra = extra_level2 % 1024; /* number of items left over */ level2_start = a; level2_aux_start = aux; /*make sure it ends up in a, not aux */ /* odd means it should end up in aux. and the final merge will get it into a */ /* even means it should end up in a, and the final mergre will do an even number of steps */ /* obviously, we should take this out. but the final bit is quite complex, * it doesnt come up in the tests (due to using powers of 2) and the extra * code doesnt cause too much of a hit. 
More importantly, it doesnt need to * be optimal, as we're only interested in data cache and brnahc * predictors, so the fact that theres a few extra instructions to get * loaded doesnt make a difference. */ if (get_count(N) & 1) { level1_finish = &level2_aux_start; level1_other = &level2_start; set_presort_count(ODD_COUNT); odd = 1; } else { level1_finish = &level2_start; level1_other = &level2_aux_start; set_presort_count(EVEN_COUNT); odd = 0; } /* printf("a = %p\n", a); printf("aux = %p\n", aux); */ for(i = 0; i < level2_count-1; i+=2) /* sort it level 2 */ { /* printf("going into level2: i=%d\n", i); */ /* merge them all into LIMIT sized bits */ presort(level2_start, LIMIT); merge(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; /* now do it in reverse */ presort(level2_start, LIMIT); merge_reverse(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; } if (i < level2_count) { /* merge them all into LIMIT sized bits */ presort(level2_start, LIMIT); merge(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; } /* this bit is too complicated to remove the 2 level tiling from. 
It doesnt * alter the results either, since we use powers of two */ /* sort the remaining bits */ /* level2 start is in the right place */ /* OUT(extra_level1_count); */ if (extra_level2) /* there is a maximum of 1 extra tevel 2 sort */ { int extra_level1_single = extra_level1_count & 0x1; unsigned int* level1_start = level2_start; unsigned int* level1_aux_start = level2_aux_start; extra_level1_count &= ~0x1; /*clear the last bit */ /* OUT(extra_level2); */ /* OUT(extra_level1_count); */ for(j = 0; j < extra_level1_count; j+=2) /* merge the level 1 cache first */ { presort(level1_start, 1024); merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */ level1_start += 1024; level1_aux_start += 1024; /* now reverse it */ presort(level1_start, 1024); merge_reverse(level1_start, 1024, presort_count, level1_aux_start); level1_start += 1024; level1_aux_start += 1024; /* these end up in aux */ } /* OUT(extra_level1_single); */ if (extra_level1_single)/* if there a full one left, its forward */ { presort(level1_start, 1024); merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */ level1_start += 1024; level1_aux_start += 1024; } /* OUT(final_extra); */ if (final_extra) /* theres less than a full level1 sized chunk */ { /* these will be sorted in one go */ /* if it turns out the number should be 2049, I may need to change this */ if (final_extra <= double_presort_count) { if (!odd) set_presort_count(ODD_COUNT); } else { if (get_count(final_extra) & 1) { if (odd) set_presort_count(EVEN_COUNT); else set_presort_count(ODD_COUNT); } } /* should this be reversed or not */ if (!extra_level1_single) { presort_flexible(level1_start, final_extra); merge(level1_start, final_extra, presort_count, level1_aux_start); } else { presort_flexible_reverse(level1_start, final_extra); merge_reverse(level1_start, final_extra, presort_count, level1_aux_start); } } /* merge the whole extra into 1 */ merge(*level1_finish, 
extra_level2, 1024, *level1_other); } /* now merge everything together */ if (N > LIMIT) { if (odd) /* its in aux - we took steps to ensure it*/ { merge(aux, N, LIMIT, a); } else { merge(a, N, LIMIT, aux); } } end: free(aux_data); init_predictor(&global_predictor[0]); init_predictor(&global_predictor[1]); init_predictor(&global_predictor[2]); init_predictor(&global_predictor[3]); init_predictor(&global_predictor[4]); init_predictor(&global_predictor[5]); init_predictor(&global_predictor[6]); init_predictor(&global_predictor[7]); init_predictor(&global_predictor[8]); init_predictor(&global_predictor[9]); init_predictor(&global_predictor[10]); init_predictor(&global_predictor[11]); init_predictor(&global_predictor[12]); init_predictor(&global_predictor[13]); }
/*
 * algorithm_n - merge sort of a[0..N-1] written as a labelled flow graph
 * (steps N2..N13; the original comments refer to a flow graph "on page
 * 162"), with every conditional reported to a branch predictor.
 *
 * a: the array to sort.
 * N: number of elements in a.  For N <= 0 nothing is sorted (the original
 *    code read a[0] and a[-1] in that case).
 *
 * Each pass reads from one area (a or aux) and writes to the other,
 * merging from both ends of the source towards alternating ends of the
 * target.  When a pass finishes without switching sides the data is
 * sorted; if that pass wrote to aux the result is copied back into a.
 *
 * Predictors 0, 4, 6 and 7 are re-initialised before returning.
 *
 * NOTE(review): the malloc() result is not checked; a failed allocation is
 * not handled here because the function has no error channel.
 */
void algorithm_n(unsigned int a[], int N)
{
    unsigned int* aux = NULL;   /* second area, same size as a            */
    int s = 0;                  /* picks which area is written to         */
    int i, j;                   /* indices into the source area           */
    int k, l;                   /* indices into the target area           */
    int d, f;                   /* d: direction; f == 0: another pass     */
    int swap;                   /* int, because k can legitimately be -1  */
    unsigned int* source;
    unsigned int* target;

    describe_predictor(&global_predictor[0], "N2");
    describe_predictor(&global_predictor[1], "N3");
    describe_predictor(&global_predictor[2], "N3 i == j");
    describe_predictor(&global_predictor[3], "N5");
    describe_predictor(&global_predictor[4], "N7");
    describe_predictor(&global_predictor[5], "N9");
    describe_predictor(&global_predictor[6], "N11");
    describe_predictor(&global_predictor[7], "N13");

    /* Nothing to sort; step N3 would otherwise index outside the array. */
    if (N <= 0)
    {
        goto done;
    }

    aux = malloc(N * sizeof(unsigned int));

N2: /* Prepare for pass */
    i = 0;
    j = N - 1;
    k = 0;
    l = N - 1;
    d = 1;
    f = 1;
    if (s == 0)
    {
        branch_taken(&global_predictor[0]);
        source = a;
        target = aux;
    }
    else
    {
        branch_not_taken(&global_predictor[0]);
        source = aux;
        target = a;
    }

N3: /* compare Ki, Kj */
    if (source[i] > source[j])
    {
        branch_taken(&global_predictor[1]);
        goto N8;
    }
    else
    {
        branch_not_taken(&global_predictor[1]);
    }

    if (i == j)
    {
        branch_taken(&global_predictor[2]);
        target[k] = source[i];
        goto N13;
    }
    else
    {
        branch_not_taken(&global_predictor[2]);
    }

    /* N4: transmit Ri */
    target[k] = source[i];
    k = k + d; /* step in the current direction */

    /* N5: stepdown? */
    i++;
    if (source[i-1] <= source[i])
    {
        branch_taken(&global_predictor[3]);
        goto N3;
    }
    else
    {
        branch_not_taken(&global_predictor[3]);
    }

N6: /* transmit Rj */
    target[k] = source[j];
    k = k + d;

    /* N7: stepdown? */
    j--;
    if (source[j+1] <= source[j])
    {
        branch_taken(&global_predictor[4]);
        goto N6;
    }
    else
    {
        branch_not_taken(&global_predictor[4]);
        goto N12;
    }

N8: /* transmit Rj */
    target[k] = source[j];
    k = k + d; /* step in the current direction */

    /* N9: stepdown? */
    j--;
    if (source[j+1] <= source[j])
    {
        branch_taken(&global_predictor[5]);
        goto N3;
    }
    else
    {
        branch_not_taken(&global_predictor[5]);
    }

N10: /* transmit Ri */
    target[k] = source[i];
    k = k + d;

    /* N11: stepdown? */
    i++;
    if (source[i-1] <= source[i])
    {
        branch_taken(&global_predictor[6]);
        goto N10;
    }
    else
    {
        branch_not_taken(&global_predictor[6]);
    }

N12: /* switch sides (of the flow graph on page 162) */
    f = 0;
    d = -d; /* change the direction */
    swap = k;
    k = l;
    l = swap;
    goto N3;

N13: /* switch areas */
    if (f == 0)
    {
        branch_taken(&global_predictor[7]);
        s = 1 - s; /* s = !s */
        goto N2;
    }
    else /* sorting is complete */
    {
        branch_not_taken(&global_predictor[7]);
        /* the last pass wrote to aux, so bring the result back into a */
        if (s == 0)
        {
            memcpy(a, target, N * sizeof(unsigned int));
        }
    }

done:
    free(aux);

    /* clear uninteresting predictors */
    init_predictor(&global_predictor[0]);
    init_predictor(&global_predictor[4]);
    init_predictor(&global_predictor[6]);
    init_predictor(&global_predictor[7]);
}
/*
 * base_quicksort7 - iterative quicksort of a[0..N-1] with an explicit
 * stack, pseudo-median-of-7 pivot selection and a final insertion sort.
 *
 * a: the array to sort.
 * N: number of elements in a.
 *
 * Ranges with r - l <= THRESHHOLD are not partitioned; they are finished
 * by insertion_sentinel() and insertion() after the main loop.  For
 * ranges with more than 40 elements between l and r, med3() of three left
 * samples and med3_2() of three right samples are swapped to the ends and
 * the middle element to a[r-1]; three compare-exchanges then order a[l],
 * a[r-1] and a[r] before partition() is called on l+1..r-1.  The larger
 * side is pushed and the smaller side is processed next.
 *
 * Predictors 5..16 are accumulated into predictor 4 ("median") and then
 * re-initialised.
 */
void base_quicksort7(unsigned int a[], int N)
{
    int l, r;
    int i;
    int m;
    int il, ir; /* names follow pl, pm and pn from Bentley/McIlroy; ir is used instead of in */
    int p;      /* predictor index for the summary loops */

    stackinit(N);

    describe_predictor(&global_predictor[0], "i");
    describe_predictor(&global_predictor[1], "j");
    describe_predictor(&global_predictor[2], "partition end");
    describe_predictor(&global_predictor[3], "insertion");
    describe_predictor(&global_predictor[4], "median");
    describe_predictor(&global_predictor[5], "median of 7 bc");
    describe_predictor(&global_predictor[6], "median of 7 ac");
    describe_predictor(&global_predictor[7], "median of 7 cb");
    describe_predictor(&global_predictor[8], "median of 7 ca");
    describe_predictor(&global_predictor[9], "median of 7 ab2");
    describe_predictor(&global_predictor[10], "median of 7 bc2");
    describe_predictor(&global_predictor[11], "median of 7 ac2");
    describe_predictor(&global_predictor[12], "median of 7 cb2");
    describe_predictor(&global_predictor[13], "median of 7 ca2");
    describe_predictor(&global_predictor[14], "median of 3 cmp1");
    describe_predictor(&global_predictor[15], "median of 3 cmp2");
    describe_predictor(&global_predictor[16], "median of 3 cmp3");

    r = N - 1;
    l = 0;

    while (1)
    {
        int n = r - l;
        int n6 = n / 6;
        int n3 = n / 3;

        if (r - l <= THRESHHOLD)
        {
            /* small range: leave it for the insertion sort, take the next one */
            if (stackempty())
            {
                break;
            }

            l = pop();
            r = pop();
            continue;
        }

        /*
         * Midpoint of l..r.  Written as l + n/2 instead of (l+r)/2 so the
         * sum cannot overflow int for very large arrays.  m is only used
         * in the n > 40 branch, where both forms give the same index.
         */
        m = l + n / 2;

        /* pseudo median of 7 partitioning */
        if (n > 40)
        {
            il = med3(a, l, l + n6, l + n3);
            /* the _2 variant exists for separate branch predictors, as it is inlined */
            ir = med3_2(a, r - n3, r - n6, r);

            exch(a[l], a[il]);
            exch(a[r], a[ir]);
            exch(a[m], a[r-1]);
        }

        pred_compexch(a[l], a[r-1], 14);
        pred_compexch(a[l], a[r], 15);
        pred_compexch(a[r-1], a[r], 16);

        /*
         * Author's note about partition(): the key is copied more times
         * than necessary because it is not removed when it is taken as the
         * key, so it is put in place more than once.  Example given by the
         * author: i == 1, j == 10, key = a[1]; key < pivot, so key is
         * swapped with a[2] and is then present in both a[1] and a[2].
         */
        i = partition(a, l + 1, r - 1);

        if (i - l > r - i)
        {
            /* left side is larger: defer it, continue with the right side */
            push(i - 1, l);
            l = i + 1;
        }
        else
        {
            /* right side is at least as large: defer it, continue left */
            push(r, i + 1);
            r = i - 1;
        }
    }

    stackclear();

    /* THRESHHOLD is compared against the difference r - l in the loop above */
    if (2*THRESHHOLD > N)
    {
        insertion_sentinel(a, N);
    }
    else
    {
        insertion_sentinel(a, 2*THRESHHOLD);
    }

    insertion(a, N);

    /* add the median predictors up into predictor 4 ... */
    for (p = 5; p <= 16; p++)
    {
        add_predictor(&global_predictor[4], &global_predictor[p]);
    }

    /* ... and then clear the individual ones */
    for (p = 5; p <= 16; p++)
    {
        init_predictor(&global_predictor[p]);
    }
}