// Each thread starts here and performs 16 hashes simultaneously. With four // threads, there are 64 hashes in flight at a time. Each thread repeats this // four times. The total number of hashes performed is 256. int main() { __builtin_nyuzi_write_control_reg(30, 0xffffffff); // Start other threads const int kSourceBlockSize = 128; const int kHashSize = 32; const int kNumBuffers = 2; const int kNumLanes = 16; unsigned int basePtr = 0x100000 + __builtin_nyuzi_read_control_reg(0) * (kHashSize * kNumLanes * kNumBuffers) + (kSourceBlockSize * kNumLanes); const vecu16_t kStepVector = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; vecu16_t inputPtr = __builtin_nyuzi_makevectori(basePtr) + (kStepVector * __builtin_nyuzi_makevectori(kHashSize)); vecu16_t tmpPtr = inputPtr + __builtin_nyuzi_makevectori(kSourceBlockSize * kNumLanes); vecu16_t outputPtr = tmpPtr + __builtin_nyuzi_makevectori(kHashSize * kNumLanes); for (int i = 0; i < 4; i++) { // Double sha-2 hash sha2Hash(inputPtr, kSourceBlockSize / kHashSize, outputPtr); sha2Hash(tmpPtr, 1, outputPtr); } return 0; }
void copyTest() { veci16_t *dest = (veci16_t*) region1Base + currentThread() * LOOP_UNROLL; veci16_t *src = (veci16_t*) region2Base + currentThread() * LOOP_UNROLL; veci16_t values = __builtin_nyuzi_makevectori(0xdeadbeef); int transferCount = kTransferSize / (64 * NUM_THREADS * LOOP_UNROLL); int unrollCount; int startTime = getTime(); startParallel(); do { // The compiler will automatically unroll this for (unrollCount = 0; unrollCount < LOOP_UNROLL; unrollCount++) dest[unrollCount] = src[unrollCount]; dest += NUM_THREADS * LOOP_UNROLL; src += NUM_THREADS * LOOP_UNROLL; } while (--transferCount); endParallel(); if (currentThread() == 0) { int endTime = getTime(); printf("copy: %g bytes/cycle\n", (float) kTransferSize / (endTime - startTime)); } }
// Memory fill benchmark. Every hardware thread writes a constant vector to
// its interleaved share of kTransferSize bytes starting at region1Base.
//
// Each thread starts LOOP_UNROLL vectors past the previous one and, after
// every group of LOOP_UNROLL stores, skips ahead by NUM_STRANDS * LOOP_UNROLL
// vectors.
int main()
{
    // The cast binds before the addition, so the offset is in units of
    // veci16, not bytes. Control register 0 is read to pick this thread's
    // starting slot (presumably the thread ID -- confirm against the ISA).
    veci16 *dest = (veci16*) region1Base + __builtin_nyuzi_read_control_reg(0) * LOOP_UNROLL;
    veci16 values = __builtin_nyuzi_makevectori(0xdeadbeef);

    // Number of loop iterations per thread. The 64 is presumably
    // sizeof(veci16) in bytes -- TODO confirm.
    // NOTE(review): the do/while below assumes this is at least 1.
    int transferCount = kTransferSize / (64 * NUM_STRANDS * LOOP_UNROLL);
    int unrollCount;

    do
    {
        // Store exactly LOOP_UNROLL vectors per iteration so the number of
        // stores always agrees with the stride and the transfer count, which
        // are both derived from LOOP_UNROLL. The trip count is a compile-time
        // constant, so the compiler can unroll this.
        for (unrollCount = 0; unrollCount < LOOP_UNROLL; unrollCount++)
            dest[unrollCount] = values;

        dest += NUM_STRANDS * LOOP_UNROLL;
    }
    while (--transferCount);

    return 0;
}
void* memset(void *_dest, int value, size_t length) { char *dest = (char*) _dest; value &= 0xff; // XXX Possibly fill bytes/words until alignment is hit if ((((unsigned int) dest) & 63) == 0) { // Write 64 bytes at a time. veci16_t reallyWideValue = __builtin_nyuzi_makevectori(value | (value << 8) | (value << 16) | (value << 24)); while (length > 64) { *((veci16_t*) dest) = reallyWideValue; length -= 64; dest += 64; } } if ((((unsigned int) dest) & 3) == 0) { // Write 4 bytes at a time. unsigned wideVal = value | (value << 8) | (value << 16) | (value << 24); while (length > 4) { *((unsigned int*) dest) = wideVal; dest += 4; length -= 4; } } // Write one byte at a time while (length > 0) { *dest++ = value; length--; } return _dest; }
// Test that a scatter store with one misaligned lane address raises a fault.
// Installs the fault and TLB miss handlers, enables the MMU in supervisor
// mode, then issues a scatter store whose address vector contains one
// address (0x17) that is not a multiple of 4. Control is expected to
// transfer to 'faultHandler' and never reach the final printf.
int main(void)
{
    // Every lane points at 'foo' except initializer index 6, which holds the
    // unaligned address 0x17.
    veci16_t pointers = {
        &foo, &foo, &foo, &foo, &foo, &foo, 0x17, &foo,
        &foo, &foo, &foo, &foo, &foo, &foo, &foo, &foo
    };

    // Handlers are registered before the MMU is switched on.
    __builtin_nyuzi_write_control_reg(CR_FAULT_HANDLER, (unsigned int) faultHandler);
    __builtin_nyuzi_write_control_reg(CR_TLB_MISS_HANDLER, (unsigned int) tlb_miss_handler);
    __builtin_nyuzi_write_control_reg(CR_FLAGS, FLAG_MMU_EN | FLAG_SUPERVISOR_EN);

    // This ensures the libc functions are mapped into the TLB so we don't generate
    // multiple TLB misses in the fault handler (doesn't break the test, just makes
    // debugging cleaner)
    printf("Starting test %d\n", 12);

    // This will cause an alignment fault on the 6th lane and jump to 'faultHandler'.
    // Use scatter store rather than a normal scalar store to ensure the
    // subcycle counter is saved correctly.
    __builtin_nyuzi_scatter_storei(pointers, __builtin_nyuzi_makevectori(0));

    // Reaching this line means the fault was not taken; the CHECKN directive
    // below makes the test harness fail if the string appears in the output.
    printf("should_not_be_here\n"); // CHECKN: should_not_be_here
    return 0;
}
// Run 16 parallel hashes, one per vector lane.
//
// pointers    - per-lane address of the first input word. Each block
//               consumes 16 consecutive 32-bit words (64 bytes) per lane.
// totalBlocks - number of 16-word blocks to process for every lane.
// outHashes   - per-lane address that receives the eight 32-bit result
//               words (32 bytes per lane).
//
// The caller must supply input that is already a whole number of blocks;
// no padding or length field is appended here.
//
// NOTE(review): SIG0/SIG1 are used both for the message schedule and for the
// round function. Standard SHA-256 uses different sigma functions for the
// two -- check the macro definitions (not visible here) if bit-exact SHA-256
// output matters.
void sha2Hash(vecu16_t pointers, int totalBlocks, vecu16_t outHashes)
{
    // Initial H values
    vecu16_t h0 = __builtin_nyuzi_makevectori(0x6A09E667);
    vecu16_t h1 = __builtin_nyuzi_makevectori(0xBB67AE85);
    vecu16_t h2 = __builtin_nyuzi_makevectori(0x3C6EF372);
    vecu16_t h3 = __builtin_nyuzi_makevectori(0xA54FF53A);
    vecu16_t h4 = __builtin_nyuzi_makevectori(0x510E527F);
    vecu16_t h5 = __builtin_nyuzi_makevectori(0x9B05688C);
    vecu16_t h6 = __builtin_nyuzi_makevectori(0x1F83D9AB);
    vecu16_t h7 = __builtin_nyuzi_makevectori(0x5BE0CD19);

    for (int i = 0; i < totalBlocks; i++)
    {
        // Message schedule: w[n] holds word n for all 16 lanes.
        vecu16_t w[64];

        // Gather the 16 input words of this block, advancing every lane's
        // pointer by one word (4 bytes) after each load. 'pointers' is a
        // by-value parameter, so the caller's vector is not modified.
        for (int index = 0; index < 16; index++)
        {
            w[index] = __builtin_nyuzi_gather_loadi(pointers);
            pointers += __builtin_nyuzi_makevectori(4);
        }

        // Expand the 16 input words into the remaining 48 schedule entries.
        for (int index = 16; index < 64; index++)
            w[index] = SIG1(w[index - 2]) + w[index - 7] + SIG0(w[index - 15]) + w[index - 16];

        // Working variables start from the running hash state.
        vecu16_t a = h0;
        vecu16_t b = h1;
        vecu16_t c = h2;
        vecu16_t d = h3;
        vecu16_t e = h4;
        vecu16_t f = h5;
        vecu16_t g = h6;
        vecu16_t h = h7;

        // 64 compression rounds. The assignments below shift the working
        // variables down by one position; their order matters because each
        // one reads a value the next one overwrites.
        for (int round = 0; round < 64; round++)
        {
            vecu16_t temp1 = h + SIG1(e) + CH(e, f, g) + __builtin_nyuzi_makevectori(K[round]) + w[round];
            vecu16_t temp2 = SIG0(a) + MA(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + temp1;
            d = c;
            c = b;
            b = a;
            a = temp1 + temp2;
        }

        // Fold this block's result into the running hash state.
        h0 += a;
        h1 += b;
        h2 += c;
        h3 += d;
        h4 += e;
        h5 += f;
        h6 += g;
        h7 += h;
    }

    // doesn't add padding or length fields to end...

    // Scatter the eight state words to consecutive 4-byte slots at each
    // lane's output address.
    __builtin_nyuzi_scatter_storei(outHashes, h0);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(4), h1);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(8), h2);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(12), h3);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(16), h4);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(20), h5);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(24), h6);
    __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(28), h7);
}
// Rotate every 32-bit element of 'x' right by 'y' bits and return the result.
//
// The rotate count is reduced modulo 32, so y == 0 (or any multiple of 32)
// returns 'x' unchanged instead of shifting by the full element width, which
// is undefined. Counts from 1 to 31 behave exactly as a plain
// (x >> y) | (x << (32 - y)).
inline vecu16_t ROTR(vecu16_t x, int y)
{
    const int rightCount = y & 31;
    // The second mask turns a left count of 32 (when rightCount is 0) into 0.
    const int leftCount = (32 - rightCount) & 31;

    return (x >> __builtin_nyuzi_makevectori(rightCount))
        | (x << __builtin_nyuzi_makevectori(leftCount));
}