Ejemplo n.º 1
0
/*
 * Crossing-minimisation driver for the graph ug.
 *
 * All sweeps operate on the model graph obtained via GD_model(ug).  Nothing
 * is done unless run() on the model reports true.  Otherwise ug is presorted
 * and its sub-cluster ports are set up, after which mincross_sweep() is
 * repeated for as long as run() keeps reporting true.  A final
 * transpose_sweep() and restorebest() finish the job.
 */
static void mincross_clust(Agraph_t *ug)
{
	Agraph_t *model = GD_model(ug);

	if (!run(model))
		return;

	presort(ug);		/* move the external nodes */
	subclustports(ug);

	for (;;) {
		/*
		 * The two sweep flags are derived from the current pass counter:
		 * its parity, and whether it lies in the first half of each
		 * group of four passes.
		 */
		mincross_sweep(model, GD_pass(model) % 2, GD_pass(model) % 4 < 2);
		if (!run(model))
			break;
	}

	transpose_sweep(model, TRUE);
	restorebest(model);
}
/*
 * tiled_mergesort - merge sort a[0..N-1] in cache-sized tiles.
 *
 * a: array of unsigned ints to sort; the merge steps are arranged so that
 *    the final result lands in a rather than in the scratch buffer.
 * N: number of elements in a.
 *
 * A scratch buffer of N + 2*LIMIT elements is allocated with memalign() and
 * released before returning.  Inside that buffer, aux is positioned so that
 * its cache index is the "opposite" of a's (see the comment further down).
 *
 * Inputs of at most 2048 elements are handled by a single
 * presort_flexible() + merge() call.  Larger inputs are cut into LIMIT-sized
 * tiles that are presorted and merged alternately forwards and in reverse;
 * any remainder is processed in 1024-element chunks, and one last merge with
 * run length LIMIT combines the tiles when N > LIMIT.
 *
 * The branch predictors in global_predictor[] are labelled on entry, and
 * predictors 0..13 are re-initialised on exit.
 */
void
tiled_mergesort(unsigned int a[], int N)
{
	int i,j; /* loop counters for the level 2 and the extra level 1 passes */

	int level2_count;
	int extra_level1_count = 0;
	int extra_level2;
	int final_extra;


	unsigned int* aux_data;	/* raw allocation, this is what gets freed */
	unsigned int* aux;	/* re-indexed scratch area inside aux_data */

	unsigned int* level2_start;
	unsigned int* level2_aux_start;

	unsigned int minusA;

	/* which of level2_start / level2_aux_start the level 1 data finishes in,
	 * and the other one; chosen from the parity of get_count(N) below */
	unsigned int** level1_finish;
	unsigned int** level1_other;

	int odd = 0;

	describe_predictor(&global_predictor[0], "forwards middle");
	describe_predictor(&global_predictor[1], "forwards next");
	describe_predictor(&global_predictor[2], "forwards end");
	describe_predictor(&global_predictor[3], "forwards setup");
	describe_predictor(&global_predictor[4], "forwards equal");
	describe_predictor(&global_predictor[5], "reverse middle");
	describe_predictor(&global_predictor[6], "reverse next");
	describe_predictor(&global_predictor[7], "reverse end");
	describe_predictor(&global_predictor[8], "reverse setup");
	describe_predictor(&global_predictor[9], "reverse equal");
	describe_predictor(&global_predictor[10], "insertion outer");
	describe_predictor(&global_predictor[11], "insertion inner");
	describe_predictor(&global_predictor[12], "insertion reverse outer");
	describe_predictor(&global_predictor[13], "insertion reverse inner");
#ifdef _USE_ROLLED_LOOPS

	describe_predictor(&global_predictor[14], "forwards left");
	describe_predictor(&global_predictor[15], "forwards right");
	describe_predictor(&global_predictor[16], "reverse left");
	describe_predictor(&global_predictor[17], "reverse right");
#else
	describe_predictor(&global_predictor[14], "forwards left 0");
	describe_predictor(&global_predictor[15], "forwards left 1");
	describe_predictor(&global_predictor[16], "forwards left 2");
	describe_predictor(&global_predictor[17], "forwards left 3");
	describe_predictor(&global_predictor[18], "forwards left 4");
	describe_predictor(&global_predictor[19], "forwards left 5");
	describe_predictor(&global_predictor[20], "forwards left 6");
	describe_predictor(&global_predictor[21], "forwards left 7");
	describe_predictor(&global_predictor[22], "forwards right 0");
	describe_predictor(&global_predictor[23], "forwards right 1");
	describe_predictor(&global_predictor[24], "forwards right 2");
	describe_predictor(&global_predictor[25], "forwards right 3");
	describe_predictor(&global_predictor[26], "forwards right 4");
	describe_predictor(&global_predictor[27], "forwards right 5");
	describe_predictor(&global_predictor[28], "forwards right 6");
	describe_predictor(&global_predictor[29], "forwards right 7");

	describe_predictor(&global_predictor[30], "reverse left 0");
	describe_predictor(&global_predictor[31], "reverse left 1");
	describe_predictor(&global_predictor[32], "reverse left 2");
	describe_predictor(&global_predictor[33], "reverse left 3");
	describe_predictor(&global_predictor[34], "reverse left 4");
	describe_predictor(&global_predictor[35], "reverse left 5");
	describe_predictor(&global_predictor[36], "reverse left 6");
	describe_predictor(&global_predictor[37], "reverse left 7");
	describe_predictor(&global_predictor[38], "reverse right 0");
	describe_predictor(&global_predictor[39], "reverse right 1");
	describe_predictor(&global_predictor[40], "reverse right 2");
	describe_predictor(&global_predictor[41], "reverse right 3");
	describe_predictor(&global_predictor[42], "reverse right 4");
	describe_predictor(&global_predictor[43], "reverse right 5");
	describe_predictor(&global_predictor[44], "reverse right 6");
	describe_predictor(&global_predictor[45], "reverse right 7");
#endif

	/* A quick explanation of the aux placement:
	 * an address is split into 3 parts: the tag, the index and the offset.
	 * Suppose there's a 32 bit address, a 32 byte cache line and 65536 cache
	 * blocks, as in our tests. In this case, the 32 bit address is split into
	 * a 5 bit offset (2^5 = 32 byte cache line), a 16 bit index (2^16 = 65536
	 * cache blocks) and the rest is the tag.  Therefore, a and aux need to
	 * have exactly the opposite index. minusA is the index aux needs to have,
	 * which we mask in. If this results in an address lower than the one we
	 * started with (in aux_data), then that's out of bounds, so we move it up
	 * by one full index range (1 << BLOCK_AND_LINE_BITS bytes).
	 */

	/* get the index we need: a's index with its top index bit flipped */
	minusA = get_index(a) ^ (1 << (BLOCK_BITS - 1));

	/* NOTE(review): the memalign() result is not checked; a failed allocation
	 * is dereferenced further down.  No error-reporting channel is visible
	 * from this function, so that behaviour is left as it was. */
	aux_data = memalign(ALIGNMENT, (N + 2*LIMIT) * sizeof(unsigned int));

	/* Clear the index bits, and mask in the desired index.
	 * The arithmetic is done in a pointer-sized integer: casting the pointer
	 * to unsigned int discards the upper half of the address on 64-bit
	 * targets.  The mask is widened before it is complemented so that the
	 * high address bits survive the AND.
	 * NOTE(review): size_t is used as the pointer-sized type; prefer uintptr_t
	 * if <stdint.h> is included by this file. */
	aux = (unsigned int*)(((size_t)aux_data & ~(size_t)(BLOCK_AND_LINE_MASK)) | ((size_t)minusA << LINE_BITS));
	if (aux < aux_data) /* then the new index is less than the old one */
	{
		aux = (unsigned int*)((size_t)aux + ((size_t)1 << (BLOCK_AND_LINE_BITS)));
	}



	if (N <= 2048) /* fits in the level 1 cache */
	{
		if (get_count(N) & 1) set_presort_count(ODD_COUNT);
		else set_presort_count(EVEN_COUNT);
			
		presort_flexible(a, N);
		merge(a, N, presort_count, aux);
		goto end;
	}

	level2_count = N / LIMIT; /* the number of standard LIMIT sized passes */
	extra_level2 = N % LIMIT; /* the number of extra items left, after the level 2 passes */
	extra_level1_count = extra_level2 / 1024; /* number of extra level 1 passes */
	final_extra = extra_level2 % 1024; /* number of items left over */

	level2_start = a;
	level2_aux_start = aux;

	/* make sure it ends up in a, not aux */
	/* odd means it should end up in aux, and the final merge will get it into a */
	/* even means it should end up in a, and the final merge will do an even number of steps */

	/* Obviously, we should take this out. But the final bit is quite complex,
	 * it doesn't come up in the tests (due to using powers of 2) and the extra
	 * code doesn't cause too much of a hit. More importantly, it doesn't need
	 * to be optimal, as we're only interested in data cache and branch
	 * predictors, so the fact that there are a few extra instructions to get
	 * loaded doesn't make a difference. */
	if (get_count(N) & 1)
	{
		level1_finish =  &level2_aux_start;
		level1_other = &level2_start;
		set_presort_count(ODD_COUNT);
		odd = 1;
	}
	else
	{
		level1_finish =  &level2_start;
		level1_other = &level2_aux_start;
		set_presort_count(EVEN_COUNT);
		odd = 0;
	}

	/* sort it level 2: tiles are handled in pairs, the first of each pair
	 * merged forwards and the second merged in reverse */
	for(i = 0; i < level2_count-1; i+=2)
	{
		/* merge them all into LIMIT sized bits */
		presort(level2_start, LIMIT);
		merge(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;


		/* now do it in reverse */
		presort(level2_start, LIMIT);
		merge_reverse(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;

	}
	if (i < level2_count) /* an unpaired last full tile is merged forwards */
	{
		/* merge them all into LIMIT sized bits */
		presort(level2_start, LIMIT);
		merge(level2_start, LIMIT, presort_count, level2_aux_start);

		level2_start += LIMIT;
		level2_aux_start += LIMIT;
	}


	/* This bit is too complicated to remove the 2 level tiling from. It
	 * doesn't alter the results either, since we use powers of two. */

	/* sort the remaining bits */
	/* level2_start is in the right place */
	if (extra_level2) /* there is a maximum of 1 extra level 2 sort */
	{
		int extra_level1_single = extra_level1_count & 0x1;
		unsigned int* level1_start = level2_start;
		unsigned int* level1_aux_start = level2_aux_start;
		extra_level1_count &= ~0x1; /* clear the last bit, so the loop below only sees pairs */
		for(j = 0; j < extra_level1_count; j+=2) /* merge the level 1 cache first */
		{
			presort(level1_start, 1024);
			merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */
			level1_start += 1024;
			level1_aux_start += 1024;

			/* now reverse it */

			presort(level1_start, 1024);
			merge_reverse(level1_start, 1024, presort_count, level1_aux_start);
			level1_start += 1024;
			level1_aux_start += 1024;
			/* these end up in aux */
		}
		if (extra_level1_single) /* if there's a full one left, it's forward */
		{
			presort(level1_start, 1024);
			merge(level1_start, 1024, presort_count, level1_aux_start); /* after this they end up in aux */
			level1_start += 1024;
			level1_aux_start += 1024;
		}

		if (final_extra) /* there's less than a full level1 sized chunk */
		{
			/* these will be sorted in one go */
			/* if it turns out the number should be 2049, I may need to change this */
			if (final_extra <= double_presort_count)
			{
				if (!odd) set_presort_count(ODD_COUNT);
			}
			else
			{
				if (get_count(final_extra) & 1)
				{
					if (odd) set_presort_count(EVEN_COUNT);
					else set_presort_count(ODD_COUNT);
				}
			}

			/* direction continues the forward/reverse alternation: reverse
			 * only when an unpaired forward chunk came just before */
			if (!extra_level1_single) 
			{
				presort_flexible(level1_start, final_extra);
				merge(level1_start, final_extra, presort_count, level1_aux_start);
			}
			else 
			{
				presort_flexible_reverse(level1_start, final_extra);
				merge_reverse(level1_start, final_extra, presort_count, level1_aux_start);
			}
		}

		/* merge the whole extra into 1 */
		merge(*level1_finish, extra_level2, 1024, *level1_other);
	}
	
	/* now merge everything together */
	if (N > LIMIT)
	{
		if (odd) /* it's in aux - we took steps to ensure it */
		{
			merge(aux, N, LIMIT, a);
		}
		else
		{
			merge(a, N, LIMIT, aux);
		}
	}

end:

	free(aux_data);
	/* NOTE(review): only predictors 0..13 are re-initialised here, although
	 * higher indices are labelled on entry - confirm this is intentional. */
	init_predictor(&global_predictor[0]);
	init_predictor(&global_predictor[1]);
	init_predictor(&global_predictor[2]);
	init_predictor(&global_predictor[3]);
	init_predictor(&global_predictor[4]);
	init_predictor(&global_predictor[5]);
	init_predictor(&global_predictor[6]);
	init_predictor(&global_predictor[7]);
	init_predictor(&global_predictor[8]);
	init_predictor(&global_predictor[9]);
	init_predictor(&global_predictor[10]);
	init_predictor(&global_predictor[11]);
	init_predictor(&global_predictor[12]);
	init_predictor(&global_predictor[13]);
}