/*
 * Process the cluster graph `ug` through its model graph (GD_model(ug)).
 * Nothing happens unless the first run() check on the model succeeds.
 * Otherwise: presort, handle the sub-cluster ports, sweep repeatedly while
 * run() keeps returning non-zero, then do a transpose sweep and restore the
 * best result.
 */
static void mincross_clust(Agraph_t *ug)
{
    Agraph_t *model = GD_model(ug);

    if (!run(model))
        return;

    presort(ug);
    /* move the external nodes */
    subclustports(ug);

    /* always sweep once, then continue for as long as run() asks for it */
    for (;;) {
        mincross_sweep(model, GD_pass(model) % 2, GD_pass(model) % 4 < 2);
        if (!run(model))
            break;
    }

    transpose_sweep(model, TRUE);
    restorebest(model);
}
void tiled_mergesort(unsigned int a[], int N) { /* track keeps an eye on where i started last */ /* outer track keeps and eye on which cache_sized/2 segemnt i started in last */ int i,j; /* indices for the first array */ int level1_count; int level2_count; int extra_level1_count = 0; int extra_level2; int final_extra; unsigned int* aux_data; unsigned int* aux; unsigned int* level2_start; unsigned int* level2_aux_start; unsigned int minusA; unsigned int** level1_finish; unsigned int** level1_other; int odd = 0; describe_predictor(&global_predictor[0], "forwards middle"); describe_predictor(&global_predictor[1], "forwards next"); describe_predictor(&global_predictor[2], "forwards end"); describe_predictor(&global_predictor[3], "forwards setup"); describe_predictor(&global_predictor[4], "forwards equal"); describe_predictor(&global_predictor[5], "reverse middle"); describe_predictor(&global_predictor[6], "reverse next"); describe_predictor(&global_predictor[7], "reverse end"); describe_predictor(&global_predictor[8], "reverse setup"); describe_predictor(&global_predictor[9], "reverse equal"); describe_predictor(&global_predictor[10], "insertion outer"); describe_predictor(&global_predictor[11], "insertion inner"); describe_predictor(&global_predictor[12], "insertion reverse outer"); describe_predictor(&global_predictor[13], "insertion reverse inner"); #ifdef _USE_ROLLED_LOOPS describe_predictor(&global_predictor[14], "forwards left"); describe_predictor(&global_predictor[15], "forwards right"); describe_predictor(&global_predictor[16], "reverse left"); describe_predictor(&global_predictor[17], "reverse right"); #else describe_predictor(&global_predictor[14], "forwards left 0"); describe_predictor(&global_predictor[15], "forwards left 1"); describe_predictor(&global_predictor[16], "forwards left 2"); describe_predictor(&global_predictor[17], "forwards left 3"); describe_predictor(&global_predictor[18], "forwards left 4"); describe_predictor(&global_predictor[19], 
"forwards left 5"); describe_predictor(&global_predictor[20], "forwards left 6"); describe_predictor(&global_predictor[21], "forwards left 7"); describe_predictor(&global_predictor[22], "forwards right 0"); describe_predictor(&global_predictor[23], "forwards right 1"); describe_predictor(&global_predictor[24], "forwards right 2"); describe_predictor(&global_predictor[25], "forwards right 3"); describe_predictor(&global_predictor[26], "forwards right 4"); describe_predictor(&global_predictor[27], "forwards right 5"); describe_predictor(&global_predictor[28], "forwards right 6"); describe_predictor(&global_predictor[29], "forwards right 7"); describe_predictor(&global_predictor[30], "reverse left 0"); describe_predictor(&global_predictor[31], "reverse left 1"); describe_predictor(&global_predictor[32], "reverse left 2"); describe_predictor(&global_predictor[33], "reverse left 3"); describe_predictor(&global_predictor[34], "reverse left 4"); describe_predictor(&global_predictor[35], "reverse left 5"); describe_predictor(&global_predictor[36], "reverse left 6"); describe_predictor(&global_predictor[37], "reverse left 7"); describe_predictor(&global_predictor[38], "reverse right 0"); describe_predictor(&global_predictor[39], "reverse right 1"); describe_predictor(&global_predictor[40], "reverse right 2"); describe_predictor(&global_predictor[41], "reverse right 3"); describe_predictor(&global_predictor[42], "reverse right 4"); describe_predictor(&global_predictor[43], "reverse right 5"); describe_predictor(&global_predictor[44], "reverse right 6"); describe_predictor(&global_predictor[45], "reverse right 7"); #endif /* a quick explanation, cause I keep needing to be reminded how I did this: * an address is split into 3 parts: the tag, the index and the offset. * Suppose theres a 32 bit address, a 32 byte cache line and 65536 cache * blocks, as in our tests. 
In this case, the 32 bit address is split into * a 5 bit offset (2^5 = 32 byte cache line), a 16 bit index (2^16 = 65536 * cache blocks) and the rest is the tag. Therefore, a and aux need to * have exactly the opposite index. minusA is the index aux needs to have, * which we mask in. If this results in an address lower than the one we * started with (in aux), then thats out of bounds, and increase it by * 65536. */ /* get the index we need*/ minusA = get_index(a) ^ (1 << (BLOCK_BITS - 1)); aux_data = memalign(ALIGNMENT, (N + 2*LIMIT) * sizeof(unsigned int)); /* clear the index bits, and mask in the desired index */ aux = (unsigned int*)(((unsigned int)aux_data & (~BLOCK_AND_LINE_MASK)) | (minusA << LINE_BITS)); if (aux < aux_data) /* then the new index is less than the old one */ { aux = (unsigned int*)((unsigned int)aux + (1 << (BLOCK_AND_LINE_BITS))); } if (N <= 2048) /* fits in the level 1 cache */ { if (get_count(N) & 1) set_presort_count(ODD_COUNT); else set_presort_count(EVEN_COUNT); presort_flexible(a, N); merge(a, N, presort_count, aux); goto end; } /* OUT(get_count(N)); */ /* OUT(N); */ level2_count = N / LIMIT; /* the number of standard LIMIT sized passes */ level1_count = LIMIT / 1024; /* the number of standard 4k sized passes, per level 2 iteration */ extra_level2 = N % LIMIT; /* the number of extra items left, after the level 2 passes*/ extra_level1_count = extra_level2 / 1024; /* number of extra level 1 passes */ final_extra = extra_level2 % 1024; /* number of items left over */ level2_start = a; level2_aux_start = aux; /*make sure it ends up in a, not aux */ /* odd means it should end up in aux. and the final merge will get it into a */ /* even means it should end up in a, and the final mergre will do an even number of steps */ /* obviously, we should take this out. but the final bit is quite complex, * it doesnt come up in the tests (due to using powers of 2) and the extra * code doesnt cause too much of a hit. 
More importantly, it doesnt need to * be optimal, as we're only interested in data cache and brnahc * predictors, so the fact that theres a few extra instructions to get * loaded doesnt make a difference. */ if (get_count(N) & 1) { level1_finish = &level2_aux_start; level1_other = &level2_start; set_presort_count(ODD_COUNT); odd = 1; } else { level1_finish = &level2_start; level1_other = &level2_aux_start; set_presort_count(EVEN_COUNT); odd = 0; } /* printf("a = %p\n", a); printf("aux = %p\n", aux); */ for(i = 0; i < level2_count-1; i+=2) /* sort it level 2 */ { /* printf("going into level2: i=%d\n", i); */ /* merge them all into LIMIT sized bits */ presort(level2_start, LIMIT); merge(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; /* now do it in reverse */ presort(level2_start, LIMIT); merge_reverse(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; } if (i < level2_count) { /* merge them all into LIMIT sized bits */ presort(level2_start, LIMIT); merge(level2_start, LIMIT, presort_count, level2_aux_start); level2_start += LIMIT; level2_aux_start += LIMIT; } /* this bit is too complicated to remove the 2 level tiling from. 
It doesnt * alter the results either, since we use powers of two */ /* sort the remaining bits */ /* level2 start is in the right place */ /* OUT(extra_level1_count); */ if (extra_level2) /* there is a maximum of 1 extra tevel 2 sort */ { int extra_level1_single = extra_level1_count & 0x1; unsigned int* level1_start = level2_start; unsigned int* level1_aux_start = level2_aux_start; extra_level1_count &= ~0x1; /*clear the last bit */ /* OUT(extra_level2); */ /* OUT(extra_level1_count); */ for(j = 0; j < extra_level1_count; j+=2) /* merge the level 1 cache first */ { presort(level1_start, 1024); merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */ level1_start += 1024; level1_aux_start += 1024; /* now reverse it */ presort(level1_start, 1024); merge_reverse(level1_start, 1024, presort_count, level1_aux_start); level1_start += 1024; level1_aux_start += 1024; /* these end up in aux */ } /* OUT(extra_level1_single); */ if (extra_level1_single)/* if there a full one left, its forward */ { presort(level1_start, 1024); merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */ level1_start += 1024; level1_aux_start += 1024; } /* OUT(final_extra); */ if (final_extra) /* theres less than a full level1 sized chunk */ { /* these will be sorted in one go */ /* if it turns out the number should be 2049, I may need to change this */ if (final_extra <= double_presort_count) { if (!odd) set_presort_count(ODD_COUNT); } else { if (get_count(final_extra) & 1) { if (odd) set_presort_count(EVEN_COUNT); else set_presort_count(ODD_COUNT); } } /* should this be reversed or not */ if (!extra_level1_single) { presort_flexible(level1_start, final_extra); merge(level1_start, final_extra, presort_count, level1_aux_start); } else { presort_flexible_reverse(level1_start, final_extra); merge_reverse(level1_start, final_extra, presort_count, level1_aux_start); } } /* merge the whole extra into 1 */ merge(*level1_finish, 
extra_level2, 1024, *level1_other); } /* now merge everything together */ if (N > LIMIT) { if (odd) /* its in aux - we took steps to ensure it*/ { merge(aux, N, LIMIT, a); } else { merge(a, N, LIMIT, aux); } } end: free(aux_data); init_predictor(&global_predictor[0]); init_predictor(&global_predictor[1]); init_predictor(&global_predictor[2]); init_predictor(&global_predictor[3]); init_predictor(&global_predictor[4]); init_predictor(&global_predictor[5]); init_predictor(&global_predictor[6]); init_predictor(&global_predictor[7]); init_predictor(&global_predictor[8]); init_predictor(&global_predictor[9]); init_predictor(&global_predictor[10]); init_predictor(&global_predictor[11]); init_predictor(&global_predictor[12]); init_predictor(&global_predictor[13]); }