/* Calculates the length of the string s, not including the terminating * \0 character. */ size_t strlen(const char *s) { size_t len; unsigned int cnt, cmp, skip, mask; vec_uchar16 *ptr, data; /* Compensate for initial mis-aligned string. */ ptr = (vec_uchar16 *)s; skip = (unsigned int)(ptr) & 15; mask = 0xFFFF >> skip; data = *ptr++; cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0); cmp &= mask; cnt = spu_extract(spu_cntlz(spu_promote(cmp, 0)), 0); len = cnt - (skip + 16); while (cnt == 32) { data = *ptr++; len -= 16; cnt = spu_extract(spu_cntlz(spu_gather(spu_cmpeq(data, 0))), 0); len += cnt; } return (len); }
unsigned int __mfc_multi_tag_reserve (unsigned int number_of_tags) { vector unsigned int table_copy; vector unsigned int one = (vector unsigned int) { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; vector unsigned int count_busy, is_valid; vector unsigned int count_total; vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 }; vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 }; table_copy = __mfc_tag_table; /* count_busy: number of consecutive busy tags count_avail: number of consecutive free tags table_copy: temporary copy of the tag table count_total: sum of count_busy and count_avail index: index of the current working tag */ do { table_copy = spu_sl (table_copy, count_avail); count_busy = spu_cntlz (table_copy); table_copy = spu_sl (table_copy, count_busy); count_avail = spu_cntlz (spu_xor(table_copy, -1)); count_total = spu_add (count_busy, count_avail); index = spu_add (index, count_total); } while (spu_extract (count_avail, 0) < number_of_tags && spu_extract (table_copy, 0) != 0); index = spu_sub (index, count_avail); /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise. */ is_valid = spu_cmpeq (table_copy, 0); index = spu_sel (index, is_valid, is_valid); /* Now I need to actually mark the tags as used. */ table_copy = spu_sl (one, number_of_tags); table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0)); table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy); __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid); return spu_extract (index, 0); }
/*
 * Per element: reinterprets ref and vals as 32-bit integers, subtracts them,
 * and returns 32 minus the number of leading zero bits of the magnitude of
 * that difference (i.e. how many low-order bits the difference occupies).
 * Identical inputs give 0.
 */
vec_uint4 bitDiff_f4(vec_float4 ref, vec_float4 vals)
{
  vec_int4 delta = spu_sub((vec_int4)ref, (vec_int4)vals);
  vec_int4 negated = spu_sub(spu_splats((int)0), delta);

  /* Keep delta where it is positive, otherwise take its negation. */
  vec_uint4 is_positive = spu_cmpgt(delta, 0);
  vec_int4 magnitude = spu_sel(negated, delta, is_positive);

  return spu_sub((vec_uint4)spu_splats(32), spu_cntlz(magnitude));
}
// Repeatedly unlinks one chunk from the chunkNext list of the given cache
// line: the first chunk flagged in the candidate mask takes over its
// successor's link, and the successor is marked CHUNKNEXT_FREE_BLOCK.
// The updated links are stored back to cache->chunkNext after every merge;
// the function returns as soon as no candidate is flagged.
// NOTE(review): SHUF0, SHUF1 and MERGE are shuffle patterns defined outside
// this block; they are presumed to interleave / pack per-chunk bytes.
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
	vec_uchar16 links = cache->chunkNext;

	for (;;) {
		// link of each chunk's successor, and the successor index with flag bits masked off
		vec_uchar16 links2 = spu_shuffle(links, links, links);
		vec_uchar16 succ = spu_and(links, spu_splats((unsigned char)CHUNKNEXT_MASK));

		// change successor index to word offset, note we don't care what the low bit shifted in is
		vec_uchar16 succ_ofs = (vec_uchar16) spu_sl( (vec_ushort8)succ, 1 );
		vec_uchar16 ofs_hi = spu_or ( succ_ofs, spu_splats((unsigned char) 1));
		vec_uchar16 ofs_lo = spu_and( succ_ofs, spu_splats((unsigned char) 254));

		// per chunk: result of comparing the successor's chunkStart with 0
		vec_uchar16 succ_is_first = (vec_uchar16) spu_shuffle(
			spu_cmpeq( cache->chunkStart[0], 0),
			spu_cmpeq( cache->chunkStart[1], 0),
			succ_ofs );

		// per chunk: does it reference the same triangle as its successor?
		vec_ushort8 tri0 = cache->chunkTriangle[0];
		vec_ushort8 tri1 = cache->chunkTriangle[1];
		vec_ushort8 succ_tri0 = spu_shuffle( tri0, tri1, spu_shuffle( ofs_lo, ofs_hi, SHUF0 ) );
		vec_ushort8 succ_tri1 = spu_shuffle( tri0, tri1, spu_shuffle( ofs_lo, ofs_hi, SHUF1 ) );
		vec_uchar16 same_tri = (vec_uchar16) spu_shuffle(
			spu_cmpeq( tri0, succ_tri0 ),
			spu_cmpeq( tri1, succ_tri1 ),
			MERGE );

		// combine the link bytes with the two tests above into the candidate mask
		vec_uchar16 blocked = spu_orc(succ_is_first, same_tri);
		vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(links, links2), blocked), 256-CHUNKNEXT_BUSY_BIT );
		vec_uint4 candidates = spu_gather( canmerge );

		// nothing left to merge
		if( !spu_extract(candidates, 0) )
			return;

		// index of the first candidate (gathered bits occupy the low 16 bits)
		vec_uint4 mergeid = spu_sub( spu_cntlz( candidates ), spu_promote((unsigned int)16, 0));

		// unsigned int firstchunk = spu_extract(mergeid, 0);
		// unsigned int nextchunk = cache->chunkNextArray[firstchunk];
		vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) links, (qword) spu_add(mergeid,13) );
		vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) links, (qword) spu_add(v_chunkNext,13) );

		// cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk];
		links = spu_shuffle( (vec_uchar16) v_chunkNextNext, links, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) );

		// cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK;
		links = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), links, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) );

		// this is for debug use only, it's not really needed...
		// cache->chunkStartArray[nextchunk] = -1;
		cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1;

		cache->chunkNext = links;
	}
}
unsigned int __mfc_tag_reserve (void) { vector unsigned int mask = (vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; vector unsigned int count_zeros, is_valid; vector signed int count_neg; count_zeros = spu_cntlz (__mfc_tag_table); count_neg = spu_sub (0, (vector signed int) count_zeros); mask = spu_rlmask (mask, (vector signed int) count_neg); __mfc_tag_table = spu_andc (__mfc_tag_table, mask); is_valid = spu_cmpeq (count_zeros, 32); count_zeros = spu_sel (count_zeros, is_valid, is_valid); return spu_extract (count_zeros, 0); }
void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks) { const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) { // merge lo bytes from unsigned shorts (array) 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 }; const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) { // get busy flag with ones in unused bytes 0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0 }; const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0); char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ]; char sync_buffer[128+127]; void* aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 ); RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer; unsigned long long cache_ea; spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); while (cache_ea) { // terminate immediately if possible if (spu_stat_in_mbox()) return; // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); unsigned int endTriangle = cache->endTriangle; vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle); // first look for short chunks vec_uchar16 next = cache->chunkNext; vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK)); // change next to word offset, note we don't care what the low bit shifted in is vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 ); vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1)); vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254)); vec_ushort8 start0 = cache->chunkStart[0]; vec_ushort8 start1 = cache->chunkStart[1]; vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) ); vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) ); vec_ushort8 starteq0 = spu_cmpeq( nstart0, 
spu_splats((unsigned short)0) ); vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) ); vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0); vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1); vec_ushort8 len0 = spu_sub( end0, start0); vec_ushort8 len1 = spu_sub( end1, start1); vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0); vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1); vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE ); vec_uint4 smallChunkGather = spu_gather(small); // check to see if chunk is already at the last triangle vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle( (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]), (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]), SHUFFLE_MERGE_BYTES) ); // check if the chunk is free vec_uint4 freeChunkGather = spu_gather( spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) ); // check to see if the chunk is being processed vec_uint4 busyChunkGather = spu_gather( spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK), spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) ); // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0 // note that if freeChunkGather is true then busyChunkGather must also be true // done=false, free=false, busy=false -> can process // free=false, busy=false -> can be merged // decide which chunk to process vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather ); vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather ); vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) ); vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask ); /* if (!spu_extract(shortSelMask, 0)) printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x 
sel=%04x\n", spu_extract(mayProcessGather, 0) & 0xffff, spu_extract(smallChunkGather, 0), spu_extract(mayProcessShortGather, 0), spu_extract(shortSelMask, 0) & 0xffff, spu_extract(mayProcessSelection, 0) & 0xffff ); */ vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16); unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0); unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0); // if there's nothing to process, try the next cache line in the rendering tasks list if (!spu_extract(mayProcessBits, 0)) { trynextcacheline: cache_ea = cache->next; // sleep(); continue; } unsigned int chunkStart = cache->chunkStartArray [chunkToProcess]; unsigned int chunkTriangle = cache->chunkTriangleArray[chunkToProcess]; unsigned int chunkNext = cache->chunkNextArray [chunkToProcess] & CHUNKNEXT_MASK; unsigned int chunkEnd = (cache->chunkStartArray [chunkNext]-1) & (NUMBER_OF_TILES-1); unsigned int chunkLength = 1 + chunkEnd-chunkStart; // only need an extra block if the block is especially long if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) { freeChunk = 32; } // mark this block as busy cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT; // if there's at least one free chunk, claim it if (freeChunk != 32) { cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED; cache->chunkTriangleArray[freeChunk] = chunkTriangle; } // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) continue; #ifdef INFO printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID, chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle, freeChunk!=32 ? 
freeChunk : -1 ); // debug_render_tasks(cache); #endif Triangle* triangle; int firstTile; do { // read the triangle data for the current triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); // get the triangle deltas firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd); if (firstTile>=0) break; // no match, try next triangle chunkTriangle = triangle->next_triangle; } while (chunkTriangle != endTriangle); // if we actually have something to process... if (firstTile>=0) { // the "normal" splitting will now become: // chunkStart .. (firstTile-1) -> triangle->next_triangle // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY) // (firstTile+NUM) .. 
chunkEnd -> chunkTriangle (FREE) int tailChunk; int thisChunk; int nextBlockStart; int thisBlockStart; int realBlockStart; do { retry: // read the cache line spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD); spu_readch(MFC_RdAtomicStat); // calculate start of next block nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK; if (nextBlockStart > chunkEnd) nextBlockStart = chunkEnd+1; // calculate start of block to mark as busy thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK; if (thisBlockStart < chunkStart) thisBlockStart = chunkStart; realBlockStart = thisBlockStart; #ifdef INFO printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID, nextBlockStart, realBlockStart, thisBlockStart, chunkStart); #endif // allocate some more free chunks vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq( spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16); unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); if (freeChunk == 32) { // if we didn't have one before, try again freeChunk = freeChunk2; // and try to get the second one freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) ); freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0); } else { // speculatively clear the free chunk just in case we don't need it cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK; } #ifdef INFO printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n", _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart ); #endif // mark region after as available for processing if required if (nextBlockStart < chunkEnd) { if (freeChunk==32) { // if no free chunk, relinquish entire block and write back cache->chunkNextArray[chunkToProcess] = chunkNext; spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD); // if writeback failed, we *might* have a free block, 
retry if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS) goto retry; // otherwise give up and try the next cache line goto trynextcacheline; } cache->chunkStartArray[freeChunk] = nextBlockStart; cache->chunkNextArray[freeChunk] = chunkNext; cache->chunkTriangleArray[freeChunk] = chunkTriangle; cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT; tailChunk = freeChunk; #ifdef INFO printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess); debug_render_tasks(cache); #endif } else { // we're gonna use freeChunk2 for the "in front" block, as we've not // used freeChunk, let's use it as it's more likely to have a free chunk freeChunk2 = freeChunk; tailChunk = chunkNext; } // mark region before as available if required and possible thisChunk = chunkToProcess; if (thisBlockStart > chunkStart) { if (freeChunk2 != 32) { // mark this region as busy cache->chunkStartArray[freeChunk2]=thisBlockStart; cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT; cache->chunkTriangleArray[freeChunk2]=chunkTriangle; // mark region before as available for processing cache->chunkNextArray[chunkToProcess]=freeChunk2; cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle; thisChunk = freeChunk2; #ifdef INFO printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #endif } else { // need to keep whole block, update info and mark bust cache->chunkTriangleArray[chunkToProcess]=chunkTriangle; cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT; realBlockStart = chunkStart; printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk); debug_render_tasks(cache); #ifdef INFO #endif sleep(); } } // merge chunks merge_cache_blocks(cache); // write the cache line back spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, 
MFC_PUTLLC_CMD); } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS); // finally after the write succeeded, update the variables chunkNext = tailChunk; chunkToProcess = thisChunk; chunkStart = firstTile; //thisBlockStart; chunkLength = nextBlockStart - firstTile; chunkEnd = chunkStart + chunkLength - 1; freeChunk = 32; // now we can process the block up to endTriangle initTileBuffers(thisBlockStart, chunkEnd); int ok=0; while (chunkTriangle != endTriangle) { #ifdef INFO printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n", _SPUID, chunkToProcess, chunkStart, chunkLength, chunkTriangle, firstTile, thisBlockStart); #endif // and actually process that triangle on these chunks processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok); ok=1; #ifdef PAUSE sleep(); #endif // and advance to the next-triangle chunkTriangle = triangle->next_triangle; // this should only ever happen if we're running really low on cache line slots // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles. 
// in this case, we process from thisBlockStart only (because we know that from // chunkStart to there has no result) and then we only process one triangle if (chunkStart != realBlockStart) { /* printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, " "firstTile=%d chunk=%d\n", _SPUID, chunkStart, realBlockStart, chunkEnd, firstTile, chunkToProcess); debug_render_tasks(cache); */ // abort the while loop break; } // read the next triangle unsigned int extra = chunkTriangle & 127; unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127); triangle = (Triangle*) (trianglebuffer+extra); unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127; // ensure DMA slot available do {} while (!spu_readchcnt(MFC_Cmd)); spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD); mfc_write_tag_mask(1<<0); mfc_read_tag_status_all(); } // until chunkTriangle == endTriangle // flush any output buffers flushTileBuffers(thisBlockStart, chunkEnd); } // firstTile>=0
/*
 * NAME:	sha256->search()
 * DESCRIPTION:	try to find a nonce which satisfies a target hash
 *
 * Nonces are tried four at a time, one per vector element, starting at
 * start_nonce.  For each group the message M is hashed with word 3 replaced
 * by the nonce (continuing from midstate), and the resulting digest is hashed
 * a second time starting from H0.  The second digest is then compared with
 * target through a borrow chain.
 *
 * range is rounded up to the next multiple of four, so up to three nonces
 * past start_nonce + range may be examined (and returned).
 *
 * Returns the first nonce of the winning group plus the index of the first
 * vector element whose borrow bit is set, or -1 if no group succeeded.
 *
 * NOTE(review): ROUND(), W(), T1(), SPLAT(), ADD() and H0 are defined
 * outside this function; the macros are presumed to operate on the locals
 * a..h, W, T1 and T2 by name, so those names must not be changed.
 */
int64_t sha256_search(const message_t M,
		      const hash_t target, const hash_t midstate,
		      uint32_t start_nonce, uint32_t range)
{
  /* round the range up to a whole number of four-nonce groups */
  uint32_t nonce, stop_nonce = start_nonce + range + (4 - (range % 4)) % 4;
# if !defined(UNROLL_SHA256)
  int t;
# endif
  vec_uint4 W0[3], a0, b0, c0, d0, e0, f0, g0, h0;
  vec_uint4 W[16], a, b, c, d, e, f, g, h, T1, T2;
  vec_uint4 borrow, solution;
  /* shuffle pattern which reverses the byte order within each 32-bit word */
  const vec_uchar16 reverse_endian = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };

  /* precompute first three rounds */

  a = SPLAT(midstate.words[0]);
  b = SPLAT(midstate.words[1]);
  c = SPLAT(midstate.words[2]);
  d = SPLAT(midstate.words[3]);
  e = SPLAT(midstate.words[4]);
  f = SPLAT(midstate.words[5]);
  g = SPLAT(midstate.words[6]);
  h = SPLAT(midstate.words[7]);

# ifdef UNROLL_SHA256
  W[0] = SPLAT(M.words[0]); ROUND(0);
  W[1] = SPLAT(M.words[1]); ROUND(1);
  W[2] = SPLAT(M.words[2]); ROUND(2);
# else
  for (t = 0; t < 3; ++t) {
    W[t] = SPLAT(M.words[t]);
    ROUND(t);
  }
# endif

  /* save the state after round 2; it does not depend on the nonce */
  W0[0] = W[0];
  W0[1] = W[1];
  W0[2] = W[2];

  a0 = a;
  b0 = b;
  c0 = c;
  d0 = d;
  e0 = e;
  f0 = f;
  g0 = g;
  h0 = h;

  /* do the search, four at a time */

  for (nonce = start_nonce; nonce != stop_nonce; nonce += 4) {
    /* restore the precomputed state */
    W[0] = W0[0];
    W[1] = W0[1];
    W[2] = W0[2];

    a = a0;
    b = b0;
    c = c0;
    d = d0;
    e = e0;
    f = f0;
    g = g0;
    h = h0;

    /* t = 3 */
    W[3] = (vec_uint4) { nonce + 0, nonce + 1, nonce + 2, nonce + 3 };
    ROUND(3);

# ifdef UNROLL_SHA256
    W[ 4] = SPLAT(M.words[ 4]); ROUND( 4);
    W[ 5] = SPLAT(M.words[ 5]); ROUND( 5);
    W[ 6] = SPLAT(M.words[ 6]); ROUND( 6);
    W[ 7] = SPLAT(M.words[ 7]); ROUND( 7);

    W[ 8] = SPLAT(M.words[ 8]); ROUND( 8);
    W[ 9] = SPLAT(M.words[ 9]); ROUND( 9);
    W[10] = SPLAT(M.words[10]); ROUND(10);
    W[11] = SPLAT(M.words[11]); ROUND(11);
    W[12] = SPLAT(M.words[12]); ROUND(12);
    W[13] = SPLAT(M.words[13]); ROUND(13);
    W[14] = SPLAT(M.words[14]); ROUND(14);
    W[15] = SPLAT(M.words[15]); ROUND(15);
# else
    for (t = 4; t < 16; ++t) {
      W[t] = SPLAT(M.words[t]);
      ROUND(t);
    }
# endif

    /* rounds 16..63 reuse W[] as a 16-entry circular schedule buffer */
# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    W[60 % 16] = W(60); ROUND(60);
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 16; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    /* the first digest becomes message words 0..7 of the second hash */
    W[0] = ADD(a, midstate.words[0]);
    W[1] = ADD(b, midstate.words[1]);
    W[2] = ADD(c, midstate.words[2]);
    W[3] = ADD(d, midstate.words[3]);
    W[4] = ADD(e, midstate.words[4]);
    W[5] = ADD(f, midstate.words[5]);
    W[6] = ADD(g, midstate.words[6]);
    W[7] = ADD(h, midstate.words[7]);

    /* first SHA-256 complete */

    a = SPLAT(H0.words[0]);
    b = SPLAT(H0.words[1]);
    c = SPLAT(H0.words[2]);
    d = SPLAT(H0.words[3]);
    e = SPLAT(H0.words[4]);
    f = SPLAT(H0.words[5]);
    g = SPLAT(H0.words[6]);
    h = SPLAT(H0.words[7]);

    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);

    /* padding of the 32-byte message: terminator bit, zeros, then the
       message length in bits (0x100 = 256) in the last word */
    W[ 8] = SPLAT(0x80000000U); ROUND( 8);

# ifdef UNROLL_SHA256
    W[ 9] = SPLAT(0x00000000U); ROUND( 9);
    W[10] = SPLAT(0x00000000U); ROUND(10);
    W[11] = SPLAT(0x00000000U); ROUND(11);
    W[12] = SPLAT(0x00000000U); ROUND(12);
    W[13] = SPLAT(0x00000000U); ROUND(13);
    W[14] = SPLAT(0x00000000U); ROUND(14);
# else
    for (t = 9; t < 15; ++t) {
      W[t] = SPLAT(0U);
      ROUND(t);
    }
# endif

    W[15] = SPLAT(0x00000100U); ROUND(15);

# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    /* t = 60..63 delayed */
# else
    for (t = 16; t < 60; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    /* compute only what is needed for the early-out test below;
       presumably T2 is the value the last digest word will end up with
       once rounds 60..63 and the final addition are done -- TODO confirm
       against the ROUND() definition */
    W[60 % 16] = W(60);
    T1 = T1(60, e, f, g, h);

    T2 = ADD(ADD(d, T1), H0.words[7]);

    /* quick check to see if any element of the last word vector is zero */
    if (__builtin_expect(spu_extract(spu_gather(spu_cmpeq(T2, 0)), 0) == 0, 1))
      continue;

    /* we have something interesting; finish the SHA-256 */

    ROUND(60);

# ifdef UNROLL_SHA256
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 61; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    a = ADD(a, H0.words[0]);
    b = ADD(b, H0.words[1]);
    c = ADD(c, H0.words[2]);
    d = ADD(d, H0.words[3]);
    e = ADD(e, H0.words[4]);
    f = ADD(f, H0.words[5]);
    g = ADD(g, H0.words[6]);
    h = ADD(h, H0.words[7]);

    /* now do the full (reversed-endian) subtraction */

    /* the borrow is propagated from digest word a / target word 7 up to
       digest word h / target word 0, each digest word byte-swapped first */
    borrow = spu_genb(SPLAT(target.words[7]),
		      spu_shuffle(a, a, reverse_endian));
    borrow = spu_genbx(SPLAT(target.words[6]),
		       spu_shuffle(b, b, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[5]),
		       spu_shuffle(c, c, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[4]),
		       spu_shuffle(d, d, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[3]),
		       spu_shuffle(e, e, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[2]),
		       spu_shuffle(f, f, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[1]),
		       spu_shuffle(g, g, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[0]),
		       spu_shuffle(h, h, reverse_endian), borrow);

    /* collect the final borrow bit of each of the four candidates */
    solution = spu_gather(borrow);

    if (__builtin_expect(spu_extract(solution, 0) == 0, 1))
      continue;

    /* we have a winner */

    /* the four gathered bits give 28..31 leading zeros, so subtracting 28
       yields the element index (0..3) of the first winning nonce */
    return nonce + (spu_extract(spu_cntlz(solution), 0) - 28);
  }

  return -1;
}
/* Scans the string pointed to by s for the character c and * returns a pointer to the last occurance of c. If * c is not found, then NULL is returned. */ char * strrchr(const char *s, int c) { int nskip; vec_uchar16 *ptr, data, vc; vec_uint4 cmp_c, cmp_0, cmp; vec_uint4 res_ptr, res_cmp; vec_uint4 mask, result; vec_uint4 one = spu_splats(0xffffU); /* Scan memory array a quadword at a time. Skip leading * mis-aligned bytes. */ ptr = (vec_uchar16 *)s; nskip = -((unsigned int)(ptr) & 15); mask = spu_rlmask(one, nskip); vc = spu_splats((unsigned char)(c)); data = *ptr++; ptr = (vec_uchar16 *)((unsigned int)ptr & ~15); cmp_c = spu_and(spu_gather(spu_cmpeq(data, vc)), mask); cmp_0 = spu_and(spu_gather(spu_cmpeq(data, 0)), mask); res_ptr = spu_splats(0U); res_cmp = spu_splats(0U); while (spu_extract(cmp_0, 0) == 0) { cmp = spu_cmpeq(cmp_c, 0); res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp); res_cmp = spu_sel(cmp_c, res_cmp, cmp); data = *ptr++; cmp_c = spu_gather(spu_cmpeq(data, vc)); cmp_0 = spu_gather(spu_cmpeq(data, 0)); cmp = spu_cmpeq(cmp_c, 0); } /* Compute the location of the last character before termination * character. * * First mask off compare results following the first termination character. */ mask = spu_sl(one, 31 - spu_extract(spu_cntlz(cmp_0), 0)); cmp_c = spu_and(cmp_c, mask); /* Conditionally update res_ptr and res_cmd if a match was found in the last * quadword. */ cmp = spu_cmpeq(cmp_c, 0); res_ptr = spu_sel(spu_promote((unsigned int)(ptr), 0), res_ptr, cmp); res_cmp = spu_sel(cmp_c, res_cmp, cmp); /* Bit reserve res_cmp for locating last occurance. */ mask = spu_cmpeq(res_cmp, 0); res_cmp = (vec_uint4)spu_maskb(spu_extract(res_cmp, 0)); res_cmp = spu_gather((vec_uchar16)spu_shuffle(res_cmp, res_cmp, VEC_LITERAL(vec_uchar16, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0))); /* Compute the location (ptr) of the last occurance of c. If no * occurance was found (ie, element 0 of res_cmp == 0, then return * NULL. 
*/ result = spu_sub(spu_add(res_ptr, 15), spu_cntlz(res_cmp)); result = spu_andc(result, mask); return ((char *)spu_extract(result, 0)); }