Esempio n. 1
0
// Entry point for every hardware thread. Each thread performs 16 hashes
// simultaneously (one per vector lane). With four threads, there are 64 hashes
// in flight at a time. Each thread repeats this four times. The total number
// of hashes performed is 256.
int main()
{
	// Start other threads
	__builtin_nyuzi_write_control_reg(30, 0xffffffff);

	const int kNumLanes = 16;
	const int kNumBuffers = 2;
	const int kHashSize = 32;
	const int kSourceBlockSize = 128;

	// Byte sizes used to lay out this thread's working memory.
	const int kPerThreadBytes = kHashSize * kNumLanes * kNumBuffers;
	const int kSourceBytes = kSourceBlockSize * kNumLanes;

	// Control register 0 scales the per-thread offset (presumably it holds the
	// current thread ID -- confirm against the hardware documentation).
	unsigned int threadBase = 0x100000
		+ __builtin_nyuzi_read_control_reg(0) * kPerThreadBytes
		+ kSourceBytes;

	// Lane N of each pointer vector addresses the data belonging to hash N;
	// consecutive lanes are kHashSize bytes apart.
	const vecu16_t kLaneIndex = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
	vecu16_t sourceLanes = __builtin_nyuzi_makevectori(threadBase)
		+ (kLaneIndex * __builtin_nyuzi_makevectori(kHashSize));
	vecu16_t scratchLanes = sourceLanes + __builtin_nyuzi_makevectori(kSourceBytes);
	vecu16_t digestLanes = scratchLanes + __builtin_nyuzi_makevectori(kHashSize * kNumLanes);

	int pass = 0;
	while (pass < 4)
	{
		// Double sha-2 hash
		// NOTE(review): the first pass stores its result through digestLanes,
		// yet the second pass reads from scratchLanes. Confirm whether the
		// first call was meant to write to the scratch buffer instead.
		sha2Hash(sourceLanes, kSourceBlockSize / kHashSize, digestLanes);
		sha2Hash(scratchLanes, 1, digestLanes);
		pass++;
	}

	return 0;
}
Esempio n. 2
0
// Copy benchmark: copies vectors from region 2 to region 1 and has thread 0
// print the resulting rate as bytes per unit of getTime().
//
// Each thread starts LOOP_UNROLL vectors past the previous thread's start and
// advances by NUM_THREADS * LOOP_UNROLL vectors per pass, so the threads
// interleave across the regions in chunks of LOOP_UNROLL vectors.
void copyTest()
{
	veci16_t *dest = (veci16_t*) region1Base + currentThread() * LOOP_UNROLL;
	veci16_t *src = (veci16_t*) region2Base + currentThread() * LOOP_UNROLL;
	// Number of passes this thread makes; the 64 is the byte size of one
	// 16-lane vector of 32-bit values.
	int transferCount = kTransferSize / (64 * NUM_THREADS * LOOP_UNROLL);
	int unrollCount;

	int startTime = getTime();
	startParallel();
	// Test the count before each pass so that a transfer size smaller than one
	// full pass performs no copies rather than decrementing past zero and
	// running away through memory.
	while (transferCount-- > 0)
	{
		// The compiler will automatically unroll this
		for (unrollCount = 0; unrollCount < LOOP_UNROLL; unrollCount++)
		{
			dest[unrollCount] = src[unrollCount];
		}

		dest += NUM_THREADS * LOOP_UNROLL;
		src += NUM_THREADS * LOOP_UNROLL;
	}
	endParallel();
	if (currentThread() == 0)
	{
		int endTime = getTime();
		printf("copy: %g bytes/cycle\n", (float) kTransferSize / (endTime - startTime));
	}
}
Esempio n. 3
0
// Store benchmark entry point: fills this thread's share of region 1 with a
// constant vector, eight vector stores per pass.
int main()
{
	veci16 fillPattern = __builtin_nyuzi_makevectori(0xdeadbeef);

	// Offset the starting position by the value of control register 0
	// (presumably the thread/strand ID -- confirm) so each strand begins on
	// its own chunk of LOOP_UNROLL vectors.
	veci16 *cursor = (veci16*) region1Base;
	cursor += __builtin_nyuzi_read_control_reg(0) * LOOP_UNROLL;

	// Number of passes; the 64 is the byte size of one vector.
	int passesLeft = kTransferSize / (64 * NUM_STRANDS * LOOP_UNROLL);
	for (;;)
	{
		// Hand-unrolled stores.
		// NOTE(review): eight stores are hard-coded while the stride uses
		// LOOP_UNROLL; this presumably relies on LOOP_UNROLL being 8.
		cursor[0] = fillPattern;
		cursor[1] = fillPattern;
		cursor[2] = fillPattern;
		cursor[3] = fillPattern;
		cursor[4] = fillPattern;
		cursor[5] = fillPattern;
		cursor[6] = fillPattern;
		cursor[7] = fillPattern;

		// Skip over the chunks owned by the other strands.
		cursor += NUM_STRANDS * LOOP_UNROLL;

		if (--passesLeft == 0)
			break;
	}
}
Esempio n. 4
0
// Fills the first 'length' bytes starting at '_dest' with the low eight bits
// of 'value' and returns '_dest'.
//
// Uses 64-byte vector stores when the destination is 64-byte aligned, then
// 4-byte stores when it is 4-byte aligned, and single-byte stores for
// whatever remains.
void* memset(void *_dest, int value, size_t length)
{
	char *dest = (char*) _dest;

	// Replicate the fill byte into every byte of a 32-bit word. This is done
	// in unsigned arithmetic: shifting a signed int with bit 7 set left by 24
	// would shift into the sign bit, which is undefined behavior.
	const unsigned int byteVal = (unsigned int) value & 0xff;
	const unsigned int wideVal = byteVal | (byteVal << 8) | (byteVal << 16)
		| (byteVal << 24);

	// XXX Possibly fill bytes/words until alignment is hit

	if ((((size_t) dest) & 63) == 0)
	{
		// Write 64 bytes at a time.
		veci16_t reallyWideValue = __builtin_nyuzi_makevectori((int) wideVal);
		// '>=' so that a final, exactly 64-byte chunk still uses a vector
		// store instead of dropping to the slower paths below.
		while (length >= 64)
		{
			*((veci16_t*) dest) = reallyWideValue;
			length -= 64;
			dest += 64;
		}
	}

	if ((((size_t) dest) & 3) == 0)
	{
		// Write 4 bytes at a time.
		while (length >= 4)
		{
			*((unsigned int*) dest) = wideVal;
			dest += 4;
			length -= 4;
		}
	}

	// Write one byte at a time
	while (length > 0)
	{
		*dest++ = (char) byteVal;
		length--;
	}

	return _dest;
}
Esempio n. 5
0
// Test entry point. Registers 'faultHandler' and 'tlb_miss_handler' in the
// corresponding control registers, sets the MMU-enable and supervisor flags,
// then issues a scatter store in which one lane's address (0x17) is not
// 4-byte aligned. Control is expected to transfer to 'faultHandler' instead
// of reaching the final printf.
int main(void)
{
    // Every lane points at 'foo' except element index 6, which holds the
    // misaligned address 0x17.
    veci16_t pointers = { &foo, &foo, &foo, &foo, &foo, &foo, 0x17,  &foo, &foo, &foo, &foo, &foo, &foo, &foo, &foo, &foo };

    // Install the handlers before setting the flags below.
    __builtin_nyuzi_write_control_reg(CR_FAULT_HANDLER, (unsigned int) faultHandler);
    __builtin_nyuzi_write_control_reg(CR_TLB_MISS_HANDLER, (unsigned int) tlb_miss_handler);
    __builtin_nyuzi_write_control_reg(CR_FLAGS, FLAG_MMU_EN | FLAG_SUPERVISOR_EN);

    // This ensures the libc functions are mapped into the TLB so we don't generate
    // multiple TLB misses in the fault handler (doesn't break the test, just makes
    // debugging cleaner)
    printf("Starting test %d\n", 12);

    // This will cause an alignment fault on the 6th lane and jump to 'faultHandler'.
    // Use scatter store rather than a normal scalar store to ensure the
    // subcycle counter is saved correctly.
    __builtin_nyuzi_scatter_storei(pointers, __builtin_nyuzi_makevectori(0));

    // Reaching this line means the fault was not taken. The trailing CHECKN
    // comment is presumably a test-harness directive asserting that this
    // string never appears in the output -- keep it verbatim.
    printf("should_not_be_here\n"); // CHECKN: should_not_be_here

    return 0;
}
Esempio n. 6
0
// Run 16 parallel hashes, one per vector lane.
//
// pointers    - per-lane addresses of the input data. Every block reads 16
//               values per lane with gather loads, advancing each lane's
//               address by 4 bytes after every load (64 bytes per block).
// totalBlocks - number of blocks to process for every lane.
// outHashes   - per-lane addresses that receive the eight state vectors at
//               byte offsets 0, 4, ..., 28.
//
// Does not add padding or length fields to the end of the message.
void sha2Hash(vecu16_t pointers, int totalBlocks, vecu16_t outHashes)
{
	// Initial H values
	static const unsigned int kInitialState[8] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
	};

	// Running hash state; each entry holds one state word for all 16 lanes.
	vecu16_t state[8];
	for (int word = 0; word < 8; word++)
		state[word] = __builtin_nyuzi_makevectori(kInitialState[word]);

	for (int block = 0; block < totalBlocks; block++)
	{
		// Message schedule: the first 16 entries come straight from memory,
		// the remaining 48 are derived from earlier entries.
		vecu16_t schedule[64];
		int t = 0;
		for (; t < 16; t++)
		{
			schedule[t] = __builtin_nyuzi_gather_loadi(pointers);
			pointers += __builtin_nyuzi_makevectori(4);
		}

		for (; t < 64; t++)
		{
			schedule[t] = SIG1(schedule[t - 2]) + schedule[t - 7]
				+ SIG0(schedule[t - 15]) + schedule[t - 16];
		}

		// Working variables start out as a copy of the current state.
		vecu16_t a = state[0];
		vecu16_t b = state[1];
		vecu16_t c = state[2];
		vecu16_t d = state[3];
		vecu16_t e = state[4];
		vecu16_t f = state[5];
		vecu16_t g = state[6];
		vecu16_t h = state[7];

		for (int step = 0; step < 64; step++)
		{
			vecu16_t temp1 = h + SIG1(e) + CH(e, f, g)
				+ __builtin_nyuzi_makevectori(K[step]) + schedule[step];
			vecu16_t temp2 = SIG0(a) + MA(a, b, c);

			// Shift the working variables down one position, folding the
			// two temporaries into 'e' and 'a'.
			h = g;
			g = f;
			f = e;
			e = d + temp1;
			d = c;
			c = b;
			b = a;
			a = temp1 + temp2;
		}

		// Accumulate this block's result into the running state.
		state[0] += a;
		state[1] += b;
		state[2] += c;
		state[3] += d;
		state[4] += e;
		state[5] += f;
		state[6] += g;
		state[7] += h;
	}

	// doesn't add padding or length fields to end...

	// Store state word N at byte offset 4 * N from each lane's output address.
	for (int word = 0; word < 8; word++)
	{
		__builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(4 * word),
			state[word]);
	}
}
Esempio n. 7
0
// Rotates every 32-bit lane of 'x' right by 'y' bits and returns the result.
//
// The rotate count is reduced modulo 32, so y == 0 (or any multiple of 32)
// returns 'x' unchanged. Without the reduction, y == 0 would shift left by
// 32, the full lane width, which is undefined.
inline vecu16_t ROTR(vecu16_t x, int y)
{
	const int rightBits = y & 31;
	// Complementary left shift; the mask maps the 32 produced when
	// rightBits is 0 back to 0.
	const int leftBits = (32 - rightBits) & 31;

	return (x >> __builtin_nyuzi_makevectori(rightBits))
		| (x << __builtin_nyuzi_makevectori(leftBits));
}