Ejemplos de spu_cntlz en C++ (Cpp)

Ejemplo n.º 1

0

Mostrar archivo

Archivo: strlen.c Proyecto: Kristoffer/Xen-4.1.2

/* Calculates  the  length  of  the string s, not including the terminating
 * \0 character.
 */
size_t strlen(const char *s)
{
  size_t len;
  unsigned int cnt, cmp, skip, mask;
  vec_uchar16 *ptr, data;

  /* Compensate for initial mis-aligned string.
   */
  ptr = (vec_uchar16 *)s;
  skip = (unsigned int)(ptr) & 15;
  mask = 0xFFFF >> skip;

  data = *ptr++;
  cmp = spu_extract(spu_gather(spu_cmpeq(data, 0)), 0);
  cmp &= mask;

  cnt = spu_extract(spu_cntlz(spu_promote(cmp, 0)), 0);
  len = cnt - (skip + 16);

  while (cnt == 32) {
    data = *ptr++;
    len -= 16;
    cnt = spu_extract(spu_cntlz(spu_gather(spu_cmpeq(data, 0))), 0);
    len += cnt;
  }

  return (len);
}

Ejemplo n.º 2

0

Mostrar archivo

Archivo: mfc_multi_tag_reserve.c Proyecto: Akheon23/chromecast-mirrored-source.toolchain

unsigned int
__mfc_multi_tag_reserve (unsigned int number_of_tags)
{
  vector unsigned int table_copy;
  vector unsigned int one = (vector unsigned int)
        { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  vector unsigned int count_busy, is_valid;
  vector unsigned int count_total;
  vector unsigned int count_avail = (vector unsigned int) { 0, 0, 0, 0 };
  vector unsigned int index = (vector unsigned int) { 0, 0, 0, 0 };

  table_copy = __mfc_tag_table;


  /* count_busy: number of consecutive busy tags
     count_avail: number of consecutive free tags
     table_copy: temporary copy of the tag table
     count_total: sum of count_busy and count_avail
     index: index of the current working tag  */
  do
    {
      table_copy = spu_sl (table_copy, count_avail);

      count_busy = spu_cntlz (table_copy);
      table_copy = spu_sl (table_copy, count_busy);
      count_avail = spu_cntlz (spu_xor(table_copy, -1));
      count_total = spu_add (count_busy, count_avail);
      index = spu_add (index, count_total);
    }
  while (spu_extract (count_avail, 0) < number_of_tags
	 && spu_extract (table_copy, 0) != 0);

  index = spu_sub (index, count_avail);

  /* is_valid is set to 0xFFFFFFFF if table_copy == 0, 0 otherwise.  */
  is_valid = spu_cmpeq (table_copy, 0);
  index = spu_sel (index, is_valid, is_valid);

  /* Now I need to actually mark the tags as used.  */
  table_copy = spu_sl (one, number_of_tags);
  table_copy = spu_rl (table_copy, -number_of_tags - spu_extract (index, 0));
  table_copy = spu_sel (table_copy, __mfc_tag_table, table_copy);
  __mfc_tag_table = spu_sel (table_copy, __mfc_tag_table, is_valid);

  return spu_extract (index, 0);
}

Ejemplo n.º 3

0

Mostrar archivo

Archivo: testutils.c Proyecto: Mashewnutz/Slo

/* For each of the four lanes, cast ref and vals to 32-bit integers,
 * subtract them, and return 32 minus the number of leading zero bits of
 * the difference.  The negated difference is used unless the difference
 * is greater than zero, so equal lanes yield 0.
 */
vec_uint4 bitDiff_f4(vec_float4 ref, vec_float4 vals) {
  vec_int4 delta = spu_sub((vec_int4)ref, (vec_int4)vals);
  vec_int4 flipped = spu_sub(spu_splats((int)0), delta);

  /* All-ones lanes where delta > 0; those lanes keep delta as is. */
  vec_uint4 is_positive = spu_cmpgt(delta, 0);
  vec_int4 magnitude = spu_sel(flipped, delta, is_positive);

  vec_uint4 leading_zeros = spu_cntlz(magnitude);

  return spu_sub((vec_uint4)spu_splats(32), leading_zeros);
}

Ejemplo n.º 4

0

Mostrar archivo

Archivo: cacheline.c Proyecto: ralferoo/spugl

/*
 * Repeatedly merge chunks of a render cache line with their successor.
 *
 * Each pass builds a 16-bit "can merge" map from cache->chunkNext,
 * cache->chunkStart[] and cache->chunkTriangle[].  If the map is empty the
 * function returns.  Otherwise the chunk selected by the leading set bit is
 * linked past its successor, the successor's next entry is set to
 * CHUNKNEXT_FREE_BLOCK, the successor's chunkStartArray entry is set to -1,
 * and the updated vector is stored back to cache->chunkNext.
 *
 * NOTE(review): SHUF0, SHUF1, MERGE and the CHUNKNEXT_* constants are defined
 * outside this block; the exact merge criteria depend on them -- confirm.
 */
inline void merge_cache_blocks(RenderableCacheLine* cache)
{
    vec_uchar16 next = cache->chunkNext;

    for (;;) {
        // next[next[i]] for every chunk i (shuffle of next indexed by itself)
        vec_uchar16 nextnext = spu_shuffle(next, next, next);
        // next index with everything outside CHUNKNEXT_MASK removed
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // all-ones halfwords where the chunk start is 0
        vec_ushort8 firstblock0 = spu_cmpeq( cache->chunkStart[0], 0);
        vec_ushort8 firstblock1 = spu_cmpeq( cache->chunkStart[1], 0);
        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        // per chunk: one byte of the "start is 0" flag of its successor
        vec_uchar16 first = (vec_uchar16) spu_shuffle( firstblock0, firstblock1, firstshuf );

        vec_ushort8 tri0 = cache->chunkTriangle[0];
        vec_ushort8 tri1 = cache->chunkTriangle[1];
        // byte offsets of the two bytes of the successor's halfword entry
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        // triangle entry of each chunk's successor (SHUF0/SHUF1 presumably
        // interleave the lo/hi offsets into halfword selectors -- confirm)
        vec_ushort8 ntri0 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 ntri1 = spu_shuffle( tri0, tri1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        // all-ones halfwords where chunk and successor are at the same triangle
        vec_ushort8 trieq0 = spu_cmpeq( tri0, ntri0 );
        vec_ushort8 trieq1 = spu_cmpeq( tri1, ntri1 );

        vec_uchar16 trieq = (vec_uchar16) spu_shuffle( trieq0, trieq1, MERGE );
        // first | ~trieq
        vec_uchar16 combi = spu_orc(first, trieq);

        // ~((next | nextnext) | combi), compared per byte against 256-CHUNKNEXT_BUSY_BIT
        vec_uchar16 canmerge = spu_cmpgt( spu_nor(spu_or(next, nextnext), combi), 256-CHUNKNEXT_BUSY_BIT );

        // one bit per chunk, packed into element 0
        vec_uint4 gather = spu_gather( canmerge );

        // leading-zero count minus 16: index of the first mergeable chunk
        // (only meaningful when gather is non-zero, which is checked next)
        vec_uint4 mergeid = spu_sub( spu_cntlz( gather ), spu_promote((unsigned int)16, 0));

        // nothing left to merge
        if( !spu_extract(gather, 0) ) {
            return;
        }

        //	unsigned int firstchunk = spu_extract(mergeid, 0);
        //	unsigned int nextchunk = cache->chunkNextArray[firstchunk];
        // the rotations bring the wanted byte of next into the low byte of element 0
        vec_uint4 v_chunkNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(mergeid,13) );
        vec_uint4 v_chunkNextNext = (vec_uint4) si_rotqby( (qword) next, (qword) spu_add(v_chunkNext,13) );

        // cache->chunkNextArray[firstchunk] = cache->chunkNextArray[nextchunk];
        next = spu_shuffle( (vec_uchar16) v_chunkNextNext, next, (vec_uchar16) si_cbd( (qword) mergeid, 0 ) );

        // cache->chunkNextArray[nextchunk] = CHUNKNEXT_FREE_BLOCK;
        next = spu_shuffle( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK), next, (vec_uchar16) si_cbd( (qword) v_chunkNext, 0 ) );

        // this is for debug use only, it's not really needed...
        // cache->chunkStartArray[nextchunk] = -1;
        cache->chunkStartArray[ spu_extract(v_chunkNext,0) & 255 ] = -1;

        // publish the updated links before looking for another merge
        cache->chunkNext = next;
    }
}

Ejemplo n.º 5

0

Mostrar archivo

Archivo: mfc_tag_reserve.c Proyecto: ChaosJohn/gcc

unsigned int
__mfc_tag_reserve (void)
{
  vector unsigned int mask = (vector unsigned int)
	{ 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  vector unsigned int count_zeros, is_valid;
  vector signed int count_neg;

  count_zeros = spu_cntlz (__mfc_tag_table);
  count_neg = spu_sub (0, (vector signed int) count_zeros);

  mask = spu_rlmask (mask, (vector signed int) count_neg);
  __mfc_tag_table = spu_andc (__mfc_tag_table, mask);

  is_valid = spu_cmpeq (count_zeros, 32);
  count_zeros = spu_sel (count_zeros, is_valid, is_valid);

  return spu_extract (count_zeros, 0);
}

Ejemplo n.º 6

0

Mostrar archivo

Archivo: cacheline.c Proyecto: ralferoo/spugl

void process_render_tasks(unsigned long eah_render_tasks, unsigned long eal_render_tasks)
{
    const vec_uchar16 SHUFFLE_MERGE_BYTES = (vec_uchar16) {	// merge lo bytes from unsigned shorts (array)
        1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
    };

    const vec_uchar16 SHUFFLE_GET_BUSY_WITH_ONES = (vec_uchar16) {	// get busy flag with ones in unused bytes
        0xc0, 0xc0, 2, 3, 0xc0,0xc0,0xc0,0xc0, 0xc0,0xc0,0xc0,0xc0
    };

    const vec_uchar16 ZERO_BYTES = (vec_uchar16) spu_splats(0);

    char trianglebuffer[ 256 + TRIANGLE_MAX_SIZE ];

    char	sync_buffer[128+127];
    void*	aligned_sync_buffer = (void*) ( ((unsigned long)sync_buffer+127) & ~127 );

    RenderableCacheLine* cache = (RenderableCacheLine*) aligned_sync_buffer;
    unsigned long long cache_ea;

    spu_mfcdma64(&cache_ea, eah_render_tasks, eal_render_tasks, sizeof(cache_ea), 0, MFC_GET_CMD);
    mfc_write_tag_mask(1<<0);
    mfc_read_tag_status_all();

    while (cache_ea) {
        // terminate immediately if possible
        if (spu_stat_in_mbox())
            return;

        // read the cache line
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
        spu_readch(MFC_RdAtomicStat);

        unsigned int endTriangle = cache->endTriangle;
        vec_ushort8 testTriangle = spu_splats((unsigned short) endTriangle);

        // first look for short chunks
        vec_uchar16 next = cache->chunkNext;
        vec_uchar16 nextmask = spu_and(next, spu_splats((unsigned char)CHUNKNEXT_MASK));

        // change next to word offset, note we don't care what the low bit shifted in is
        vec_uchar16 firstshuf = (vec_uchar16) spu_sl( (vec_ushort8)nextmask, 1 );
        vec_uchar16 trishufhi = spu_or ( firstshuf, spu_splats((unsigned char) 1));
        vec_uchar16 trishuflo = spu_and( firstshuf, spu_splats((unsigned char) 254));

        vec_ushort8 start0 = cache->chunkStart[0];
        vec_ushort8 start1 = cache->chunkStart[1];

        vec_ushort8 nstart0 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF0 ) );
        vec_ushort8 nstart1 = spu_shuffle( start0, start1, spu_shuffle( trishuflo, trishufhi, SHUF1 ) );

        vec_ushort8 starteq0 = spu_cmpeq( nstart0, spu_splats((unsigned short)0) );
        vec_ushort8 starteq1 = spu_cmpeq( nstart1, spu_splats((unsigned short)0) );

        vec_ushort8 end0 = spu_sel( nstart0, spu_splats((unsigned short)4096), starteq0);
        vec_ushort8 end1 = spu_sel( nstart1, spu_splats((unsigned short)4096), starteq1);

        vec_ushort8 len0 = spu_sub( end0, start0);
        vec_ushort8 len1 = spu_sub( end1, start1);

        vec_ushort8 small0 = spu_cmpgt( spu_splats((unsigned short)17), len0);
        vec_ushort8 small1 = spu_cmpgt( spu_splats((unsigned short)17), len1);
        vec_uchar16 small = (vec_uchar16) spu_shuffle( small0, small1, MERGE );
        vec_uint4 smallChunkGather = spu_gather(small);

        // check to see if chunk is already at the last triangle
        vec_uint4 doneChunkGather = spu_gather( (vec_uchar16) spu_shuffle(
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[0]),
                (vec_uchar16) spu_cmpeq(testTriangle, cache->chunkTriangle[1]),
                SHUFFLE_MERGE_BYTES) );

        // check if the chunk is free
        vec_uint4 freeChunkGather = spu_gather(
                                        spu_cmpeq( spu_splats( (unsigned char) CHUNKNEXT_FREE_BLOCK ), cache->chunkNext ) );

        // check to see if the chunk is being processed
        vec_uint4 busyChunkGather = spu_gather(
                                        spu_cmpgt( cache->chunkNext, //spu_and(cache->chunkNext, CHUNKNEXT_MASK),
                                                spu_splats( (unsigned char) (CHUNKNEXT_BUSY_BIT-1) ) ) );

        // doneChunkGather, freeChunkGather, busyChunkGather - rightmost 16 bits of word 0
        // note that if freeChunkGather is true then busyChunkGather must also be true

        // done=false, free=false, busy=false -> can process
        // free=false, busy=false -> can be merged

        // decide which chunk to process
        vec_uint4 mayProcessGather = spu_nor( doneChunkGather, busyChunkGather );
        vec_uint4 mayProcessShortGather = spu_and( mayProcessGather, smallChunkGather );

        vec_uint4 shortSelMask = spu_cmpeq( mayProcessShortGather, spu_splats(0U) );
        vec_uint4 mayProcessSelection = spu_sel( mayProcessShortGather, mayProcessGather, shortSelMask );

        /*
        		if (!spu_extract(shortSelMask, 0))
        			printf("taken short: may=%04x short=%04x mayshort=%04x mask=%04x sel=%04x\n",
        				spu_extract(mayProcessGather, 0) & 0xffff,
        				spu_extract(smallChunkGather, 0),
        				spu_extract(mayProcessShortGather, 0),
        				spu_extract(shortSelMask, 0) & 0xffff,
        				spu_extract(mayProcessSelection, 0) & 0xffff );
        */

        vec_uint4 mayProcessBits = spu_sl( mayProcessSelection, 16);
        unsigned int chunkToProcess = spu_extract( spu_cntlz( mayProcessBits ), 0);
        unsigned int freeChunk = spu_extract( spu_cntlz( spu_sl( freeChunkGather, 16 ) ), 0);

        // if there's nothing to process, try the next cache line in the rendering tasks list
        if (!spu_extract(mayProcessBits, 0)) {
trynextcacheline:
            cache_ea = cache->next;
            // sleep();
            continue;
        }

        unsigned int chunkStart    	= cache->chunkStartArray   [chunkToProcess];
        unsigned int chunkTriangle	= cache->chunkTriangleArray[chunkToProcess];
        unsigned int chunkNext		= cache->chunkNextArray	   [chunkToProcess] & CHUNKNEXT_MASK;
        unsigned int chunkEnd		= (cache->chunkStartArray  [chunkNext]-1) & (NUMBER_OF_TILES-1);
        unsigned int chunkLength	= 1 + chunkEnd-chunkStart;

        // only need an extra block if the block is especially long
        if (chunkLength <= NUMBER_OF_TILES_PER_CHUNK) {
            freeChunk = 32;
        }

        // mark this block as busy
        cache->chunkNextArray[chunkToProcess] |= CHUNKNEXT_BUSY_BIT;

        // if there's at least one free chunk, claim it
        if (freeChunk != 32) {
            cache->chunkNextArray[freeChunk] = CHUNKNEXT_RESERVED;
            cache->chunkTriangleArray[freeChunk] = chunkTriangle;
        }

        // write the cache line back
        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
            continue;

#ifdef INFO
        printf("[%d] Claimed chunk %d (%d-%d len %d) at tri %x end %x with free chunk %d\n", _SPUID,
               chunkToProcess, chunkStart, chunkEnd, chunkLength, chunkTriangle, endTriangle,
               freeChunk!=32 ? freeChunk : -1 );
//		debug_render_tasks(cache);
#endif

        Triangle* triangle;
        int firstTile;
        do {
            // read the triangle data for the current triangle
            unsigned int extra = chunkTriangle & 127;
            unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
            triangle = (Triangle*) (trianglebuffer+extra);
            unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

            // ensure DMA slot available
            do {} while (!spu_readchcnt(MFC_Cmd));

            spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea), mfc_ea2l(trianglebuffer_ea),
                         length, 0, MFC_GET_CMD);
            mfc_write_tag_mask(1<<0);
            mfc_read_tag_status_all();

            // get the triangle deltas
            firstTile = findFirstTriangleTile(triangle, chunkStart, chunkEnd);

            if (firstTile>=0)
                break;

            // no match, try next triangle
            chunkTriangle = triangle->next_triangle;
        } while (chunkTriangle != endTriangle);

        // if we actually have something to process...
        if (firstTile>=0) {
            // the "normal" splitting will now become:
            // chunkStart .. (firstTile-1)	-> triangle->next_triangle
            // firstTile .. (firstTile+NUM-1) -> chunkTriangle (BUSY)
            // (firstTile+NUM) .. chunkEnd -> chunkTriangle (FREE)

            int tailChunk;
            int thisChunk;
            int nextBlockStart;
            int thisBlockStart;
            int realBlockStart;
            do {
retry:
                // read the cache line
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_GETLLAR_CMD);
                spu_readch(MFC_RdAtomicStat);

                // calculate start of next block
                nextBlockStart = firstTile + NUMBER_OF_TILES_PER_CHUNK;
                if (nextBlockStart > chunkEnd)
                    nextBlockStart = chunkEnd+1;

                // calculate start of block to mark as busy
                thisBlockStart = nextBlockStart - NUMBER_OF_TILES_PER_CHUNK;
                if (thisBlockStart < chunkStart)
                    thisBlockStart = chunkStart;
                realBlockStart = thisBlockStart;

#ifdef INFO
                printf("[%d] nextBlockStart=%d, realBlockStart=%d, thisBlockStart=%d, chunkStart=%d\n", _SPUID,
                       nextBlockStart, realBlockStart, thisBlockStart, chunkStart);
#endif


                // allocate some more free chunks
                vec_uint4 freeChunkGather2 = spu_sl(spu_gather(spu_cmpeq(
                                                        spu_splats((unsigned char)CHUNKNEXT_FREE_BLOCK), cache->chunkNext)), 16);
                unsigned int freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);

                if (freeChunk == 32) {
                    // if we didn't have one before, try again
                    freeChunk = freeChunk2;

                    // and try to get the second one
                    freeChunkGather2 = spu_andc( freeChunkGather2, spu_promote(0x80000000>>freeChunk2, 0) );
                    freeChunk2 = spu_extract(spu_cntlz(freeChunkGather2), 0);
                } else {
                    // speculatively clear the free chunk just in case we don't need it
                    cache->chunkNextArray[freeChunk] = CHUNKNEXT_FREE_BLOCK;
                }

#ifdef INFO
                printf("[%d] Free chunks %d and %d, cN=%d, nBS=%d, cE=%d, tBS=%d, cS=%d\n",
                       _SPUID, freeChunk, freeChunk2, chunkNext, nextBlockStart, chunkEnd, thisBlockStart, chunkStart );
#endif

                // mark region after as available for processing if required
                if (nextBlockStart < chunkEnd) {
                    if (freeChunk==32) {
                        // if no free chunk, relinquish entire block and write back
                        cache->chunkNextArray[chunkToProcess] = chunkNext;
                        spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
                        // if writeback failed, we *might* have a free block, retry
                        if (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS)
                            goto retry;

                        // otherwise give up and try the next cache line
                        goto trynextcacheline;
                    }
                    cache->chunkStartArray[freeChunk] = nextBlockStart;
                    cache->chunkNextArray[freeChunk] = chunkNext;
                    cache->chunkTriangleArray[freeChunk] = chunkTriangle;
                    cache->chunkNextArray[chunkToProcess] = freeChunk | CHUNKNEXT_BUSY_BIT;
                    tailChunk = freeChunk;
#ifdef INFO
                    printf("[%d] Insert tail, tailChunk=%d, chunkNext=%d, chunkToProcess=%d\n", _SPUID, tailChunk, chunkNext, chunkToProcess);
                    debug_render_tasks(cache);
#endif
                } else {
                    // we're gonna use freeChunk2 for the "in front" block, as we've not
                    // used freeChunk, let's use it as it's more likely to have a free chunk
                    freeChunk2 = freeChunk;
                    tailChunk = chunkNext;
                }

                // mark region before as available if required and possible
                thisChunk = chunkToProcess;
                if (thisBlockStart > chunkStart) {
                    if (freeChunk2 != 32) {
                        // mark this region as busy
                        cache->chunkStartArray[freeChunk2]=thisBlockStart;
                        cache->chunkNextArray[freeChunk2]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        cache->chunkTriangleArray[freeChunk2]=chunkTriangle;

                        // mark region before as available for processing
                        cache->chunkNextArray[chunkToProcess]=freeChunk2;
                        cache->chunkTriangleArray[chunkToProcess]=triangle->next_triangle;
                        thisChunk = freeChunk2;
#ifdef INFO
                        printf("[%d] Insert new head, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#endif
                    } else {
                        // need to keep whole block, update info and mark bust
                        cache->chunkTriangleArray[chunkToProcess]=chunkTriangle;
                        cache->chunkNextArray[chunkToProcess]=tailChunk | CHUNKNEXT_BUSY_BIT;
                        realBlockStart = chunkStart;
                        printf("[%d] Keep whole block, tailChunk=%d, chunkNext=%d, thisChunk=%d\n", _SPUID, tailChunk, chunkNext, thisChunk);
                        debug_render_tasks(cache);
#ifdef INFO
#endif
                        sleep();
                    }
                }

                // merge chunks
                merge_cache_blocks(cache);

                // write the cache line back
                spu_mfcdma64(cache, mfc_ea2h(cache_ea), mfc_ea2l(cache_ea), 128, 0, MFC_PUTLLC_CMD);
            } while (spu_readch(MFC_RdAtomicStat) & MFC_PUTLLC_STATUS);

            // finally after the write succeeded, update the variables
            chunkNext = tailChunk;
            chunkToProcess = thisChunk;
            chunkStart = firstTile; //thisBlockStart;
            chunkLength = nextBlockStart - firstTile;
            chunkEnd = chunkStart + chunkLength - 1;
            freeChunk = 32;

            // now we can process the block up to endTriangle
            initTileBuffers(thisBlockStart, chunkEnd);

            int ok=0;
            while (chunkTriangle != endTriangle) {
#ifdef INFO
                printf("[%d] Processing chunk %d at %4d len %4d, triangle %04x first=%d tbs=%d\n",
                       _SPUID, chunkToProcess, chunkStart, chunkLength,
                       chunkTriangle, firstTile, thisBlockStart);
#endif
                // and actually process that triangle on these chunks
                processTriangleChunks(triangle, cache, thisBlockStart, chunkEnd, chunkTriangle, ok);
                ok=1;
#ifdef PAUSE
                sleep();
#endif
                // and advance to the next-triangle
                chunkTriangle = triangle->next_triangle;

                // this should only ever happen if we're running really low on cache line slots
                // basically, if we pick up a block with more than NUMBER_OF_TILES_PER_CHUNK and
                // there's no slot to store the pre-NUMBER_OF_TILES_PER_CHUNK tiles.
                // in this case, we process from thisBlockStart only (because we know that from
                // chunkStart to there has no result) and then we only process one triangle
                if (chunkStart != realBlockStart) {
                    /*
                    printf("[%d] chunkStart=%d != realBlockStart %d, chunkEnd=%d, "
                    	"firstTile=%d chunk=%d\n",
                    	_SPUID, chunkStart, realBlockStart, chunkEnd,
                    	firstTile, chunkToProcess);
                    debug_render_tasks(cache);
                    */

                    // abort the while loop
                    break;
                }

                // read the next triangle
                unsigned int extra = chunkTriangle & 127;
                unsigned long long trianglebuffer_ea = cache_ea + TRIANGLE_OFFSET_FROM_CACHE_LINE + (chunkTriangle & ~127);
                triangle = (Triangle*) (trianglebuffer+extra);
                unsigned int length = (extra + TRIANGLE_MAX_SIZE + 127) & ~127;

                // ensure DMA slot available
                do {} while (!spu_readchcnt(MFC_Cmd));

                spu_mfcdma64(trianglebuffer, mfc_ea2h(trianglebuffer_ea),
                             mfc_ea2l(trianglebuffer_ea), length, 0, MFC_GET_CMD);
                mfc_write_tag_mask(1<<0);
                mfc_read_tag_status_all();
            } // until chunkTriangle == endTriangle

            // flush any output buffers
            flushTileBuffers(thisBlockStart, chunkEnd);

        } // firstTile>=0

Ejemplo n.º 7

0

Mostrar archivo

Archivo: sha256.c Proyecto: Gatz85/cellminer

/*
 * NAME:	sha256->search()
 * DESCRIPTION:	try to find a nonce which satisfies a target hash
 *
 * Four candidate nonces are evaluated per loop iteration, one per vector
 * lane.  For each candidate, word 3 of the message schedule is replaced by
 * the nonce, 64 rounds are run starting from midstate, and the result is
 * fed through a second 64-round pass starting from H0.  The byte-reversed
 * result is then compared against target with a borrow chain.
 *
 * M:           message; words 0..2 and 4..15 are used as given
 * target:      eight-word value the result is compared against
 * midstate:    eight-word state that seeds the first pass and is added
 *              back to its result
 * start_nonce: first nonce to try
 * range:       number of nonces to try; rounded up to a multiple of four,
 *              so up to three nonces past start_nonce + range may be tried
 *
 * Returns the matching nonce, or -1 when the loop ends without a match.
 *
 * NOTE(review): SPLAT, ROUND, W(), T1(), ADD and H0 are defined outside
 * this block; comments about them describe how they are used here only.
 */
int64_t sha256_search(const message_t M,
		      const hash_t target, const hash_t midstate,
		      uint32_t start_nonce, uint32_t range)
{
  /* stop_nonce is start_nonce + range rounded up to whole groups of four */
  uint32_t nonce, stop_nonce = start_nonce + range + (4 - (range % 4)) % 4;
# if !defined(UNROLL_SHA256)
  int t;
# endif
  /* W0[] and a0..h0 hold the state after the nonce-independent rounds */
  vec_uint4 W0[3], a0, b0, c0, d0, e0, f0, g0, h0;
  vec_uint4 W[16], a, b, c, d, e, f, g, h, T1, T2;
  vec_uint4 borrow, solution;
  /* shuffle pattern that reverses the byte order inside each 32-bit word */
  const vec_uchar16 reverse_endian = {
     3,  2,  1,  0,
     7,  6,  5,  4,
    11, 10,  9,  8,
    15, 14, 13, 12
  };

  /* precompute first three rounds */

  a = SPLAT(midstate.words[0]);
  b = SPLAT(midstate.words[1]);
  c = SPLAT(midstate.words[2]);
  d = SPLAT(midstate.words[3]);
  e = SPLAT(midstate.words[4]);
  f = SPLAT(midstate.words[5]);
  g = SPLAT(midstate.words[6]);
  h = SPLAT(midstate.words[7]);

# ifdef UNROLL_SHA256
  W[0] = SPLAT(M.words[0]); ROUND(0);
  W[1] = SPLAT(M.words[1]); ROUND(1);
  W[2] = SPLAT(M.words[2]); ROUND(2);
# else
  for (t = 0; t < 3; ++t) {
    W[t] = SPLAT(M.words[t]);
    ROUND(t);
  }
# endif

  /* save the schedule words and state so each iteration can restart here */
  W0[0] = W[0];
  W0[1] = W[1];
  W0[2] = W[2];

  a0 = a;
  b0 = b;
  c0 = c;
  d0 = d;
  e0 = e;
  f0 = f;
  g0 = g;
  h0 = h;

  /* do the search, four at a time */

  for (nonce = start_nonce; nonce != stop_nonce; nonce += 4) {
    /* restore the precomputed state of rounds 0..2 */
    W[0] = W0[0];
    W[1] = W0[1];
    W[2] = W0[2];

    a = a0;
    b = b0;
    c = c0;
    d = d0;
    e = e0;
    f = f0;
    g = g0;
    h = h0;

    /* t = 3 */
    /* each lane gets its own candidate nonce */
    W[3] = (vec_uint4) { nonce + 0, nonce + 1, nonce + 2, nonce + 3 };
    ROUND(3);

# ifdef UNROLL_SHA256
    W[ 4] = SPLAT(M.words[ 4]); ROUND( 4);
    W[ 5] = SPLAT(M.words[ 5]); ROUND( 5);
    W[ 6] = SPLAT(M.words[ 6]); ROUND( 6);
    W[ 7] = SPLAT(M.words[ 7]); ROUND( 7);

    W[ 8] = SPLAT(M.words[ 8]); ROUND( 8);
    W[ 9] = SPLAT(M.words[ 9]); ROUND( 9);
    W[10] = SPLAT(M.words[10]); ROUND(10);
    W[11] = SPLAT(M.words[11]); ROUND(11);
    W[12] = SPLAT(M.words[12]); ROUND(12);
    W[13] = SPLAT(M.words[13]); ROUND(13);
    W[14] = SPLAT(M.words[14]); ROUND(14);
    W[15] = SPLAT(M.words[15]); ROUND(15);
# else
    for (t = 4; t < 16; ++t) {
      W[t] = SPLAT(M.words[t]);
      ROUND(t);
    }
# endif

    /* rounds 16..63: W[] is used as a 16-entry circular buffer */
# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    W[60 % 16] = W(60); ROUND(60);
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 16; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    /* add midstate back; the sums become words 0..7 of the second pass */
    W[0] = ADD(a, midstate.words[0]);
    W[1] = ADD(b, midstate.words[1]);
    W[2] = ADD(c, midstate.words[2]);
    W[3] = ADD(d, midstate.words[3]);
    W[4] = ADD(e, midstate.words[4]);
    W[5] = ADD(f, midstate.words[5]);
    W[6] = ADD(g, midstate.words[6]);
    W[7] = ADD(h, midstate.words[7]);

    /* first SHA-256 complete */

    /* second pass starts from H0 (presumably the SHA-256 initial hash
       value -- confirm against its definition) */
    a = SPLAT(H0.words[0]);
    b = SPLAT(H0.words[1]);
    c = SPLAT(H0.words[2]);
    d = SPLAT(H0.words[3]);
    e = SPLAT(H0.words[4]);
    f = SPLAT(H0.words[5]);
    g = SPLAT(H0.words[6]);
    h = SPLAT(H0.words[7]);

    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);

    /* words 8..15 are constants: 0x80000000, six zero words, 0x00000100
       (presumably SHA-256 padding for a 256-bit message -- confirm) */
    W[ 8] = SPLAT(0x80000000U); ROUND( 8);

# ifdef UNROLL_SHA256
    W[ 9] = SPLAT(0x00000000U); ROUND( 9);
    W[10] = SPLAT(0x00000000U); ROUND(10);
    W[11] = SPLAT(0x00000000U); ROUND(11);
    W[12] = SPLAT(0x00000000U); ROUND(12);
    W[13] = SPLAT(0x00000000U); ROUND(13);
    W[14] = SPLAT(0x00000000U); ROUND(14);
# else
    for (t = 9; t < 15; ++t) {
      W[t] = SPLAT(0U);
      ROUND(t);
    }
# endif

    W[15] = SPLAT(0x00000100U); ROUND(15);

    /* rounds 16..59 only; the last four are deferred until the quick
       check below says they are worth computing */
# ifdef UNROLL_SHA256
    W[16 % 16] = W(16); ROUND(16);
    W[17 % 16] = W(17); ROUND(17);
    W[18 % 16] = W(18); ROUND(18);
    W[19 % 16] = W(19); ROUND(19);
    W[20 % 16] = W(20); ROUND(20);
    W[21 % 16] = W(21); ROUND(21);
    W[22 % 16] = W(22); ROUND(22);
    W[23 % 16] = W(23); ROUND(23);

    W[24 % 16] = W(24); ROUND(24);
    W[25 % 16] = W(25); ROUND(25);
    W[26 % 16] = W(26); ROUND(26);
    W[27 % 16] = W(27); ROUND(27);
    W[28 % 16] = W(28); ROUND(28);
    W[29 % 16] = W(29); ROUND(29);
    W[30 % 16] = W(30); ROUND(30);
    W[31 % 16] = W(31); ROUND(31);

    W[32 % 16] = W(32); ROUND(32);
    W[33 % 16] = W(33); ROUND(33);
    W[34 % 16] = W(34); ROUND(34);
    W[35 % 16] = W(35); ROUND(35);
    W[36 % 16] = W(36); ROUND(36);
    W[37 % 16] = W(37); ROUND(37);
    W[38 % 16] = W(38); ROUND(38);
    W[39 % 16] = W(39); ROUND(39);

    W[40 % 16] = W(40); ROUND(40);
    W[41 % 16] = W(41); ROUND(41);
    W[42 % 16] = W(42); ROUND(42);
    W[43 % 16] = W(43); ROUND(43);
    W[44 % 16] = W(44); ROUND(44);
    W[45 % 16] = W(45); ROUND(45);
    W[46 % 16] = W(46); ROUND(46);
    W[47 % 16] = W(47); ROUND(47);

    W[48 % 16] = W(48); ROUND(48);
    W[49 % 16] = W(49); ROUND(49);
    W[50 % 16] = W(50); ROUND(50);
    W[51 % 16] = W(51); ROUND(51);
    W[52 % 16] = W(52); ROUND(52);
    W[53 % 16] = W(53); ROUND(53);
    W[54 % 16] = W(54); ROUND(54);
    W[55 % 16] = W(55); ROUND(55);

    W[56 % 16] = W(56); ROUND(56);
    W[57 % 16] = W(57); ROUND(57);
    W[58 % 16] = W(58); ROUND(58);
    W[59 % 16] = W(59); ROUND(59);
    /* t = 60..63 delayed */
# else
    for (t = 16; t < 60; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    W[60 % 16] = W(60);
    T1 = T1(60, e, f, g, h);

    /* d + T1 + H0.words[7]: the value tested by the quick check below */
    T2 = ADD(ADD(d, T1), H0.words[7]);

    /* quick check to see if any element of the last word vector is zero */
    if (__builtin_expect(spu_extract(spu_gather(spu_cmpeq(T2, 0)), 0) == 0, 1))
      continue;

    /* we have something interesting; finish the SHA-256 */

    ROUND(60);

# ifdef UNROLL_SHA256
    W[61 % 16] = W(61); ROUND(61);
    W[62 % 16] = W(62); ROUND(62);
    W[63 % 16] = W(63); ROUND(63);
# else
    for (t = 61; t < 64; ++t) {
      W[t % 16] = W(t);
      ROUND(t);
    }
# endif

    a = ADD(a, H0.words[0]);
    b = ADD(b, H0.words[1]);
    c = ADD(c, H0.words[2]);
    d = ADD(d, H0.words[3]);
    e = ADD(e, H0.words[4]);
    f = ADD(f, H0.words[5]);
    g = ADD(g, H0.words[6]);
    h = ADD(h, H0.words[7]);

    /* now do the full (reversed-endian) subtraction */

    /* the borrow of each word-wise step is carried into the next one;
       target.words[7] is paired with a and target.words[0] with h */
    borrow = spu_genb(SPLAT(target.words[7]),
		      spu_shuffle(a, a, reverse_endian));
    borrow = spu_genbx(SPLAT(target.words[6]),
		       spu_shuffle(b, b, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[5]),
		       spu_shuffle(c, c, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[4]),
		       spu_shuffle(d, d, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[3]),
		       spu_shuffle(e, e, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[2]),
		       spu_shuffle(f, f, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[1]),
		       spu_shuffle(g, g, reverse_endian), borrow);
    borrow = spu_genbx(SPLAT(target.words[0]),
		       spu_shuffle(h, h, reverse_endian), borrow);

    /* pack the final borrow bit of every lane into element 0 */
    solution = spu_gather(borrow);

    if (__builtin_expect(spu_extract(solution, 0) == 0, 1))
      continue;

    /* we have a winner */

    /* the leading-zero count minus 28 is taken as the index of the first
       matching lane (assumes spu_gather leaves four bits, lane 0 highest
       -- confirm against the intrinsic reference) */
    return nonce + (spu_extract(spu_cntlz(solution), 0) - 28);
  }

  /* every candidate was rejected */
  return -1;
}

Ejemplo n.º 8

0

Mostrar archivo

Archivo: strrchr.c Proyecto: 32bitmicro/newlib-nano-1.0

/* Search the string pointed to by s for the character c and return a
 * pointer to its last occurrence.  NULL is returned when c does not
 * occur in the string.
 */
char * strrchr(const char *s, int c)
{
  vec_uchar16 *quad, chunk, needle;
  vec_uint4 hit_c, hit_nul, no_hit;
  vec_uint4 best_ptr, best_hits;
  vec_uint4 keep, not_found, answer;
  vec_uint4 low16 = spu_splats(0xffffU);
  int shift;

  /* Walk the string one quadword at a time.  For the first quadword
   * the compare bits of the bytes in front of s are masked off.
   */
  quad = (vec_uchar16 *)s;

  shift = -((unsigned int)(quad) & 15);
  keep = spu_rlmask(low16, shift);

  needle = spu_splats((unsigned char)(c));

  chunk = *quad++;
  /* From here on quad is the aligned address of the next quadword. */
  quad = (vec_uchar16 *)((unsigned int)quad & ~15);

  hit_c = spu_and(spu_gather(spu_cmpeq(chunk, needle)), keep);
  hit_nul = spu_and(spu_gather(spu_cmpeq(chunk, 0)), keep);

  best_ptr = spu_splats(0U);
  best_hits = spu_splats(0U);

  /* Keep going until a quadword containing the terminator shows up. */
  while (spu_extract(hit_nul, 0) == 0) {
    /* Remember the most recent quadword that had a match: best_ptr is
     * the address just past it, best_hits its compare bits.
     */
    no_hit = spu_cmpeq(hit_c, 0);

    best_ptr = spu_sel(spu_promote((unsigned int)(quad), 0), best_ptr, no_hit);
    best_hits = spu_sel(hit_c, best_hits, no_hit);

    chunk = *quad++;

    hit_c = spu_gather(spu_cmpeq(chunk, needle));
    hit_nul = spu_gather(spu_cmpeq(chunk, 0));
  }

  /* In the final quadword, discard the compare bits of every byte that
   * follows the first termination character.
   */
  keep = spu_sl(low16, 31 - spu_extract(spu_cntlz(hit_nul), 0));
  hit_c = spu_and(hit_c, keep);

  /* Update best_ptr and best_hits once more if the final quadword
   * still has a match.
   */
  no_hit = spu_cmpeq(hit_c, 0);

  best_ptr = spu_sel(spu_promote((unsigned int)(quad), 0), best_ptr, no_hit);
  best_hits = spu_sel(hit_c, best_hits, no_hit);

  /* All ones when no match was recorded at all. */
  not_found = spu_cmpeq(best_hits, 0);

  /* Bit reverse best_hits so that counting leading zeros locates the
   * last occurrence instead of the first.
   */
  best_hits = (vec_uint4)spu_maskb(spu_extract(best_hits, 0));
  best_hits = spu_gather((vec_uchar16)spu_shuffle(best_hits, best_hits,
						  VEC_LITERAL(vec_uchar16,
							      15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)));

  /* Turn the quadword address and bit position into a byte address,
   * then force the result to NULL when nothing was found.
   */
  answer = spu_sub(spu_add(best_ptr, 15), spu_cntlz(best_hits));
  answer = spu_andc(answer, not_found);

  return ((char *)spu_extract(answer, 0));
}