/* Issue a prefetch hint for the memory at p using every valid combination
   of the read/write selector (second argument, 0 or 1) and the temporal
   locality level (third argument, 0 through 3).  */
void
foo (char *p)
{
  char *const addr = p;

  /* Second argument 0, locality 0..3.  */
  __builtin_prefetch (addr, 0, 0);
  __builtin_prefetch (addr, 0, 1);
  __builtin_prefetch (addr, 0, 2);
  __builtin_prefetch (addr, 0, 3);

  /* Second argument 1, locality 0..3.  */
  __builtin_prefetch (addr, 1, 0);
  __builtin_prefetch (addr, 1, 1);
  __builtin_prefetch (addr, 1, 2);
  __builtin_prefetch (addr, 1, 3);
}
//list get next is just a macro that either calls this for maps, or returns Node->next ListNode *MapGetNext(ListNode *CurrItem) { ListNode *SubNode, *Head; if (! CurrItem) return(NULL); if (CurrItem->Next) { if (CurrItem->Next->Next) { //it's unlikely that we will be looking up the same item again, because maps maintain seperate chains of items //and the likelyhood of hitting the same chain twice is low. THIS IS NOT TRUE FOR REPEATED LOOKUPS ON A LIST //because with a list we go through the same items over and over again whenever looking for items in the chain //Thus for maps we call this prefetch code, which prefetches into the L1 cache, but not into the larger, long-term //L2 cache. As we're unlikely to be revisiting this chain in the near future, we don't want to pollute the L2 //cache with it //This is a disaster for straight forward lists though, because they have only one chain that gets revisited on //every search for an item __builtin_prefetch (CurrItem->Next->Next, 0, 0); if (CurrItem->Next->Next->Tag) __builtin_prefetch (CurrItem->Next->Next->Tag, 0, 0); } return(CurrItem->Next); } if (CurrItem->Flags & LIST_FLAG_MAP_HEAD) { CurrItem=(ListNode *) CurrItem->Item; if (CurrItem->Next) return(CurrItem->Next); } //'Head' here points to a BUCKET HEADER. These are marked with this flag, except the last one //so we know when we've reached the end Head=ListGetHead(CurrItem); while (Head->Flags & LIST_FLAG_MAP_CHAIN) { Head++; if (Head->Next) return(Head->Next); } return(NULL); }
/* Compiler diagnostic test for __builtin_prefetch argument checking.
 * The first three calls (one, two and three arguments) carry no annotation.
 * Each of the remaining calls is deliberately invalid and is followed by a
 * comment naming the diagnostic the compiler is meant to emit for it:
 * a fourth argument, a string as the second argument, a second argument
 * outside 0..1, and a third argument outside 0..3.
 * NOTE(review): the whole function sits on one physical line here, so the
 * first `//` swallows the rest of the line; the annotations presumably
 * belong one per statement line -- confirm against the upstream test. */
int foo() { int a; __builtin_prefetch(&a); __builtin_prefetch(&a, 1); __builtin_prefetch(&a, 1, 2); __builtin_prefetch(&a, 1, 9, 3); // expected-error{{too many arguments to function}} __builtin_prefetch(&a, "hello", 2); // expected-error{{argument to __builtin_prefetch must be a constant integer}} __builtin_prefetch(&a, 2); // expected-error{{argument should be a value from 0 to 1}} __builtin_prefetch(&a, 0, 4); // expected-error{{argument should be a value from 0 to 3}} __builtin_prefetch(&a, -1, 4); // expected-error{{argument should be a value from 0 to 1}} }
/**
 * Copy `length` bytes from `data` into dmx_data[0].data and record the new
 * send length via dmx_set_send_data_length().
 *
 * Before copying, spins (issuing dmb() on every pass, at least once) until
 * dmx_send_state is either IDLE or DMXINTER.
 */
void dmx_set_send_data(const uint8_t *data, uint16_t length) {
	const size_t nbytes = (size_t)length;

	for (;;) {
		dmb();
		if (dmx_send_state == IDLE || dmx_send_state == DMXINTER) {
			break;
		}
	}

	/* Start pulling the source into cache just ahead of the copy. */
	__builtin_prefetch(data);
	memcpy(dmx_data[0].data, data, nbytes);

	dmx_set_send_data_length(length);
}
// Hint that the memory at `pointer` will be needed soon.
// Only active when BOOST_SIMD_ARCH_X86 is defined:
//  - with a GNU-compatible compiler, a read prefetch with locality 0 is issued;
//  - otherwise, with BOOST_SIMD_HAS_SSE_SUPPORT, _mm_prefetch is used with
//    `Strategy`.
// In every other configuration the function does nothing.
// NOTE(review): `Strategy` is not declared in this block -- presumably a
// template parameter of the enclosing declaration; the GNU branch ignores it.
void prefetch(void const* pointer)
{
#if defined(BOOST_SIMD_ARCH_X86) && defined(__GNUC__)
  __builtin_prefetch(pointer, 0, 0);
#elif defined(BOOST_SIMD_ARCH_X86) && defined(BOOST_SIMD_HAS_SSE_SUPPORT)
  _mm_prefetch( static_cast<char const *>(pointer), Strategy);
#endif
}
/* Issue prefetch hints for p where both the access kind (read / write) and
   the locality level (none, low, moderate, high) are given as named
   constants declared elsewhere in the file.  */
void
good_enum (const int *p)
{
  const int *const addr = p;

  /* Access kind `read`, each locality level in turn.  */
  __builtin_prefetch (addr, read, none);
  __builtin_prefetch (addr, read, low);
  __builtin_prefetch (addr, read, moderate);
  __builtin_prefetch (addr, read, high);

  /* Access kind `write`, each locality level in turn.  */
  __builtin_prefetch (addr, write, none);
  __builtin_prefetch (addr, write, low);
  __builtin_prefetch (addr, write, moderate);
  __builtin_prefetch (addr, write, high);
}
/* Issue prefetch hints for p with literal arguments covering every valid
   pair: second argument 0 or 1, third argument 0 through 3.  */
void
good (int *p)
{
  int *const addr = p;

  /* Second argument 0.  */
  __builtin_prefetch (addr, 0, 0);
  __builtin_prefetch (addr, 0, 1);
  __builtin_prefetch (addr, 0, 2);
  __builtin_prefetch (addr, 0, 3);

  /* Second argument 1.  */
  __builtin_prefetch (addr, 1, 0);
  __builtin_prefetch (addr, 1, 1);
  __builtin_prefetch (addr, 1, 2);
  __builtin_prefetch (addr, 1, 3);
}
/* Issue prefetch hints for p using a mix of literal arguments and the
   named constants READ_ACCESS / WRITE_ACCESS and *_TEMPORAL_LOCALITY,
   which are defined elsewhere in the file.  */
void
good_const (const int *p)
{
  const int *const addr = p;

  /* Literal arguments, then a named access kind.  */
  __builtin_prefetch (addr, 0, 0);
  __builtin_prefetch (addr, 0, 1);
  __builtin_prefetch (addr, 0, 2);
  __builtin_prefetch (addr, READ_ACCESS, 3);

  /* Named locality levels, then both arguments named.  */
  __builtin_prefetch (addr, 1, NO_TEMPORAL_LOCALITY);
  __builtin_prefetch (addr, 1, LOW_TEMPORAL_LOCALITY);
  __builtin_prefetch (addr, 1, MODERATE_TEMPORAL_LOCALITY);
  __builtin_prefetch (addr, WRITE_ACCESS, HIGH_TEMPORAL_LOCALITY);
}
// Transforms `num` packed 3-component vertices from `v_arr` (x, y, z triples
// stored back to back) by this matrix and submits each one to OpenGL.
//
// With GLMATRIX_USE_SSE the four transformed components are computed with
// SSE builtins and passed to ::glVertex4fv. Otherwise x, y and z are computed
// in scalar code, divided by the computed fourth component, and passed to
// ::glVertex3fv.
//
// The `register` storage class used previously is deprecated since C++11 and
// ill-formed from C++17 on, so it has been removed; it had no effect on the
// generated results.
void GLMatrix<GLfloat>::glVertex3v(int num, const GLfloat* v_arr)
{
#ifdef GLMATRIX_USE_SSE
    __builtin_prefetch(v_arr);

    sse_vector r0, r1, r2;
    sse_v4sf m_col0, m_col1, m_col2, m_col3;

    // Load the four matrix columns once, outside the loop.
    m_col0 = __builtin_ia32_loadaps(m);
    m_col1 = __builtin_ia32_loadaps(m+4);
    m_col2 = __builtin_ia32_loadaps(m+8);
    m_col3 = __builtin_ia32_loadaps(m+12);

    for(int k = 0; k < num; ++k)
    {
        // load x, y, z
        r0.v4sf = __builtin_ia32_loadss(v_arr);
        r1.v4sf = __builtin_ia32_loadss(v_arr+1);
        r2.v4sf = __builtin_ia32_loadss(v_arr+2);

        // broadcast each scalar into all 4 lanes
        r0.v4sf = __builtin_ia32_shufps(r0.v4sf,r0.v4sf,0x00);
        r1.v4sf = __builtin_ia32_shufps(r1.v4sf,r1.v4sf,0x00);
        r2.v4sf = __builtin_ia32_shufps(r2.v4sf,r2.v4sf,0x00);

        // x*col0, y*col1, z*col2
        r0.v4sf = __builtin_ia32_mulps(r0.v4sf,m_col0);
        r1.v4sf = __builtin_ia32_mulps(r1.v4sf,m_col1);
        r2.v4sf = __builtin_ia32_mulps(r2.v4sf,m_col2);

        // (x*col0 + y*col1) + (z*col2 + col3)
        r0.v4sf = __builtin_ia32_addps(r0.v4sf,r1.v4sf);
        r2.v4sf = __builtin_ia32_addps(r2.v4sf,m_col3);
        r0.v4sf = __builtin_ia32_addps(r0.v4sf,r2.v4sf);

        ::glVertex4fv(r0.f);

        v_arr += 3;   // advance to the next vertex
    }
#else
    GLfloat ret[3];
    GLfloat recip;

    for(int k = 0; k < num; ++k)
    {
        const GLfloat x = v_arr[k*3];
        const GLfloat y = v_arr[1+k*3];
        const GLfloat z = v_arr[2+k*3];

        ret[0] = x*m0 + y*m4 + z*m8  + m12;
        ret[1] = x*m1 + y*m5 + z*m9  + m13;
        ret[2] = x*m2 + y*m6 + z*m10 + m14;

        // Divide by the fourth component so a 3-component vertex can be sent.
        recip = 1/(x*m3 + y*m7 + z*m11 + m15);
        ret[0] *= recip;
        ret[1] *= recip;
        ret[2] *= recip;

        ::glVertex3fv(ret);
    }
#endif
}
/* Fetch the next element from the iterator.

   Stores the element in *addr and returns 1 when one exists.
   Stores NULL in *addr and returns 0 when the endpoint is reached.

   When the current chunk is exhausted the iterator first tries the next
   chunk of the same list, then the first chunk of the following list.  */
int
iter_next (iterator_t *itr, void **addr)
{
  assert (itr);

  if (itr->current_index >= itr->val->next_insert_pos)
    {
      /* Current chunk is used up: hop to a different chunk.  */
      if (itr->val->next_val)
        {
          /* Next chunk on the same list.  */
          itr->val = itr->val->next_val;
        }
      else if (itr->current_list + 1 < itr->next_insert_pos)
        {
          /* First chunk of the next list.  */
          itr->current_list += 1;
          itr->val = itr->list_array[itr->current_list]->vals;
        }
      else
        {
          /* Endpoint reached.  */
          *addr = NULL;
          return 0;
        }

      itr->current_index = 0;

      /* Prefetch the chunk that follows the one just entered, if any.  */
      if (itr->val->next_val)
        __builtin_prefetch (itr->val->next_val->array, 0, 0);
    }

  *addr = itr->val->array[itr->current_index++];
  return 1;
}
// Hint that the byte at `ptr + offset` will be needed soon.
// `offset` is in bytes and defaults to 32*10.
//  - GNU-compatible compilers: uses __builtin_prefetch.
//  - MSVC with CAROTENE_NEON defined: uses __prefetch.
//  - anything else: no-op (arguments are only marked as used).
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined(__GNUC__)
    const char *const target = static_cast<const char*>(ptr) + offset;
    __builtin_prefetch(target);
#elif defined(_MSC_VER) && defined(CAROTENE_NEON)
    const char *const target = static_cast<const char*>(ptr) + offset;
    __prefetch(target);
#else
    (void)ptr;
    (void)offset;
#endif
}
void mypp_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { // limited implementation assert(Order==CblasRowMajor); assert(Uplo==CblasUpper); assert(N==lda); __builtin_prefetch (Y, 1, 3); __builtin_prefetch (X, 1, 3); int i,j; double temp, reg1, reg2; const double *pA, *pX; double* pY = Y; pA = A; pX = X; // y = beta*y for(i=0;i<lda;i++,pY+=incY) (*pY) = beta * (*pY); // reset pointers pY = Y; for(i=0;i<lda;i++,pA+=i,pY+=incY) { pX = X + i*incX; reg1 = (*pX++); (*pY) += alpha * (*pA++) * reg1; temp = 0.0; for(j=i+1;j<N;j++,pA++,pX+=incX) { reg2 = alpha * (*pA); temp += reg2 * (*pX); Y[j*incY] += reg2 * reg1; } (*pY) += temp; } }
/*
 * Look `key` up in the hash table, inserting it if absent.
 *
 * Up to REHASH_LIMIT buckets are probed, each chosen by hashing the key with
 * a different seed.  On a hit *found is set to true; when the key is inserted
 * into a bucket with free space *found is set to false.  Either way the
 * return value is the entry's offset from ht->table.  If every probe fails,
 * rehash_error_exit() is called.
 *
 * With HASH_PREFETCH defined, the first bucket is prefetched up front and,
 * whenever the current bucket's HT_BSIZE counter equals the bucket size, the
 * next candidate bucket is hashed and prefetched before the current one is
 * searched.
 */
hkey_t hash_table_find_or_insert(HashTable *ht, const BinaryKmer key,
                                 bool *found)
{
  const BinaryKmer *slot;
  size_t attempt;
  uint_fast32_t h;

#ifdef HASH_PREFETCH
  uint_fast32_t h2 = binary_kmer_hash(key,ht->seed+0) & ht->hash_mask;
  __builtin_prefetch(ht_bckt_ptr(ht, h2), 0, 1);
#endif

  for(attempt = 0; attempt < REHASH_LIMIT; attempt++)
  {
#ifdef HASH_PREFETCH
    h = h2;
    if(ht->buckets[h][HT_BSIZE] == ht->bucket_size) {
      h2 = binary_kmer_hash(key,ht->seed+attempt+1) & ht->hash_mask;
      __builtin_prefetch(ht_bckt_ptr(ht, h2), 0, 1);
    }
#else
    h = binary_kmer_hash(key,ht->seed+attempt) & ht->hash_mask;
#endif

    slot = hash_table_find_in_bucket_mt(ht, h, key);

    if(slot != NULL) {
      *found = true;
      return (hkey_t)(slot - ht->table);
    }

    if(ht->buckets[h][HT_BITEMS] < ht->bucket_size) {
      *found = false;
      slot = hash_table_insert_in_bucket(ht, h, key);
      ht->collisions[attempt]++; // only increment collisions when inserting
      ht->num_kmers++;
      return (hkey_t)(slot - ht->table);
    }
  }

  rehash_error_exit(ht);
}
void prefetch() const { #if defined(__x86_64__) HPX_ASSERT(sizeof(void*) == 8); #else HPX_ASSERT(sizeof(void*) == 4); #endif __builtin_prefetch(m_sp, 1, 3); __builtin_prefetch(m_sp, 0, 3); __builtin_prefetch(static_cast<void**>(m_sp) + 64 / sizeof(void*), 1, 3); __builtin_prefetch(static_cast<void**>(m_sp) + 64 / sizeof(void*), 0, 3); #if !defined(__x86_64__) __builtin_prefetch(static_cast<void**>(m_sp) + 32 / sizeof(void*), 1, 3); __builtin_prefetch(static_cast<void**>(m_sp) + 32 / sizeof(void*), 0, 3); __builtin_prefetch(static_cast<void**>(m_sp) - 32 / sizeof(void*), 1, 3); __builtin_prefetch(static_cast<void**>(m_sp) - 32 / sizeof(void*), 0, 3); #endif __builtin_prefetch(static_cast<void**>(m_sp) - 64 / sizeof(void*), 1, 3); __builtin_prefetch(static_cast<void**>(m_sp) - 64 / sizeof(void*), 0, 3); }
// tries to put array of words in cache void bitset_cache_prefetch(bitset_container_t* B) { #ifdef IS_X64 const int32_t CACHELINESIZE = computecacheline(); // 64 bytes per cache line #else const int32_t CACHELINESIZE = 64; #endif for (int32_t k = 0; k < BITSET_CONTAINER_SIZE_IN_WORDS; k += CACHELINESIZE / (int32_t)sizeof(uint64_t)) { __builtin_prefetch(B->array + k); } }
/**
 * trie_lookup:
 * @trie: A #Trie.
 * @key: The key to look up.
 *
 * Walks @trie from its root, following one node per character of @key,
 * and returns the value stored on the node reached at the end of @key.
 * The walk stops early as soon as no matching node is found.
 *
 * Returns: (transfer none): The value of the final node, or %NULL if the
 *   walk did not reach one.
 */
gpointer
trie_lookup (Trie        *trie,
             const gchar *key)
{
   TrieNode *cursor;
   const gchar *ch;

   /* Issued before the argument checks; these are only hints. */
   __builtin_prefetch(trie);
   __builtin_prefetch(key);

   g_return_val_if_fail(trie, NULL);
   g_return_val_if_fail(key, NULL);

   cursor = trie->root;
   for (ch = key; *ch != '\0' && cursor != NULL; ch++) {
      cursor = trie_find_node(trie, cursor, *ch);
   }

   if (cursor == NULL) {
      return NULL;
   }
   return cursor->value;
}
/* Exercises a range of compiler builtins (byte swap, constant_p, string
 * functions, expect, offsetof, ctzl) and finally the one-, two- and
 * three-argument forms of __builtin_prefetch.  Returns the last value
 * assigned to `a`.
 *
 * `a` is initialised so the first builtin call does not read an
 * indeterminate value (undefined behaviour in the previous version). */
int main(int argc, char **argv)
{
  int a = 0;

  a = __builtin_bswap32(a);
  a = __builtin_bswap64(a);
  a = __builtin_constant_p(1);
  a = __builtin_constant_p("string");

  char *b = __builtin_strchr("string", 's');

  a = __builtin_expect(1, a);
  a = __builtin_strlen("string");
  a = __builtin_strcmp("string1", "string2");
  a = __builtin_offsetof(struct point, y);

  char c[100];
  b = __builtin_strcpy(c, "a");
  b = __builtin_strncpy(c, "a", 1);

  a = __builtin_ctzl(a);

  varargsfn(0);

  /* Prefetch with defaulted and explicit optional arguments. */
  __builtin_prefetch(b);
  __builtin_prefetch(b, 1);
  __builtin_prefetch(b, 1, 1);

  return a;
}
/**
 * Like dmx_set_send_data(), but for payloads that do not include the start
 * code: DMX512_START_CODE is written to dmx_data[0].data[0], the `length`
 * bytes of `data` are copied right after it, and the send length is set to
 * length + 1.
 *
 * Before touching the buffer, spins (issuing dmb() on every pass, at least
 * once) until dmx_send_state is either IDLE or DMXINTER.
 */
void dmx_set_send_data_without_sc(const uint8_t *data, uint16_t length) {
	const size_t nbytes = (size_t) length;

	for (;;) {
		dmb();
		if (dmx_send_state == IDLE || dmx_send_state == DMXINTER) {
			break;
		}
	}

	dmx_data[0].data[0] = DMX512_START_CODE;

	/* Start pulling the source into cache just ahead of the copy. */
	__builtin_prefetch(data);
	memcpy(&dmx_data[0].data[1], data, nbytes);

	dmx_set_send_data_length(length + 1);
}
inline void mismatch_prefetch( Iterator& f1, Iterator& e1, Iterator& f2, Iterator& e2, Compare & c, Persistence_data & d, std::size_t& n, ctl::detail::term_z2_tag t) const { while( f1 != e1 && f2 != e2){ if( c(*f1, *f2)){ const auto i = d.cascade_boundary_map[ f1->cell()].rbegin(); __builtin_prefetch( std::addressof( *i)); break; } if( c( *f2, *f1)){ const auto i = d.cascade_boundary_map[ f2->cell()].rbegin(); __builtin_prefetch( std::addressof( *i)); break; } ++f1, ++f2; n -= 2; } }
/* FCM decoder: reads `n` 64-bit values from the compressed stream `in` and
 * writes them to `out`.  Returns the input position after the last byte
 * consumed.
 *
 * Values are unpacked with p4dec64() in blocks of VSIZE into a scratch
 * buffer; each one is then combined through DEC64 with the prediction held
 * in the hash table slot `h`, after which the slot and the hash are updated
 * from the decoded value.  `start` is not used by this function. */
unsigned char *fpfcmdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start) {
  uint64_t htab[1<<HBITS] = {0}, blk[VSIZE+32];
  uint64_t h = 0, *op, *p;
  unsigned char *ip = in;
  uint64_t *const full_end = out+(n&~(VSIZE-1));   /* end of the whole-block part */

  #define FD64(i) { uint64_t u = DEC64(p[i], htab[h]); op[i] = u; htab[h] = u; h = HASH64(h,u); }

  /* Whole blocks, four values per inner step. */
  op = (uint64_t*)out;
  while(op != full_end) {
    __builtin_prefetch(ip+512, 0);
    ip = p4dec64(ip, VSIZE, blk);
    for(p = blk; p != &blk[VSIZE]; p += 4, op += 4) {
      FD64(0);
      FD64(1);
      FD64(2);
      FD64(3);
    }
  }

  /* Remaining values, one at a time. */
  n = ((uint64_t *)out+n) - op;
  if(n) {
    ip = p4dec64(ip, n, blk);
    for(p = blk; p != &blk[n]; p++, op++)
      FD64(0);
  }
  return ip;
}
/* DFCM decoder: reads `n` 64-bit values from the compressed stream `in` and
 * writes them to `out`.  Returns the input position after the last byte
 * consumed.
 *
 * Values are unpacked with p4dec64() in blocks of VSIZE into a scratch
 * buffer.  Each one is combined through DEC64 with (table slot + previous
 * value); the slot is then set to the difference between the decoded value
 * and the previous one, the hash is updated from that difference, and the
 * decoded value becomes the new previous value.  `start` seeds both the
 * previous value and table slot 0. */
unsigned char *fpdfcmdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start) {
  unsigned char *ip = in;
  uint64_t blk[VSIZE+32], htab[1<<HBITS] = {0};
  uint64_t h = 0, *op, *p;
  uint64_t *const full_end = out+(n&~(VSIZE-1));   /* end of the whole-block part */

  htab[0] = start;

  #define DD64(i) { uint64_t u = DEC64(p[i], (htab[h]+start)); op[i] = u; htab[h] = start = u-start; h = HASH64(h,start); start = u; }

  /* Whole blocks, four values per inner step. */
  op = (uint64_t*)out;
  while(op != full_end) {
    __builtin_prefetch(ip+512, 0);
    ip = p4dec64(ip, VSIZE, blk);
    for(p = blk; p != &blk[VSIZE]; p += 4, op += 4) {
      DD64(0);
      DD64(1);
      DD64(2);
      DD64(3);
    }
  }

  /* Remaining values, one at a time. */
  n = ((uint64_t *)out+n) - op;
  if(n) {
    ip = p4dec64(ip, n, blk);
    for(p = blk; p != &blk[n]; p++, op++)
      DD64(0);
  }
  return ip;
}
// Returns the rating of `board`.
//
//  - If the board already carries a valid cached rating (board.safe), that
//    value is returned directly; this is hinted as the expected case.
//  - If the board is won, the result depends on bits 0x6 of wonState:
//    0x2 -> RATING_P1_WON, 0x4 -> RATING_P2_WON, anything else -> 0.
//    Nothing is cached in this case.
//  - Otherwise the rating is
//        (open chances of P1 - open chances of P2) * chance_bonus
//        + singleRatingTable[P1 bits] - singleRatingTable[P2 bits],
//    clamped to [-minmaxscore, minmaxscore], then stored on the board
//    (board.rating / board.safe) and returned.
//
// NOTE(review): the function writes board.safe / board.rating through a
// const reference while being declared gnu::pure -- presumably those members
// are `mutable` and serve purely as a cache; confirm.
[[gnu::hot, gnu::pure]]
Rating rate(const TicTacBoard& board){
    if(__builtin_expect(board.safe, true)){
        return board.rating;
    }

    if(board.isWon()){
        const auto winnerBits = board.wonState & 0x6;
        if(winnerBits == 0x2){
            return Ratings::RATING_P1_WON;
        }
        if(winnerBits == 0x4){
            return Ratings::RATING_P2_WON;
        }
        return 0;
    }

    const FieldBits bitsP1 = board.setPlayerOne.bitsUsed;
    const FieldBits bitsP2 = board.setPlayerTwo.bitsUsed;

    // Start loading the per-player table entries; they are read further down.
    __builtin_prefetch(&singleRatingTable[bitsP1]);
    __builtin_prefetch(&singleRatingTable[bitsP2]);

    const FieldBits chancesP1 = chancesTable[bitsP1];
    const FieldBits chancesP2 = chancesTable[bitsP2];

    // A chance only counts while the opponent has not occupied it.
    const int openP1 = __builtin_popcount(chancesP1 & ~bitsP2);
    const int openP2 = __builtin_popcount(chancesP2 & ~bitsP1);
    const int chancesDiff = openP1 - openP2;

    Rating score = 0;
    score += chancesDiff * chance_bonus;
    score += singleRatingTable[bitsP1];
    score -= singleRatingTable[bitsP2];
    score = std::max(-minmaxscore, std::min(minmaxscore, score));

    board.safe = true;
    board.rating = score;
    return score;
}
void *reader( void *ptr) { struct timespec start, end; int tid = *((int *) ptr); uint64_t seed = 0xdeadbeef + tid; int sum = 0, i; /** < The node and lock to use in an iteration */ int node_id[BATCH_SIZE], lock_id[BATCH_SIZE], I; /** < Total number of iterations (for measurement) */ int num_iters = 0; clock_gettime(CLOCK_REALTIME, &start); while(1) { if(num_iters >= ITERS_PER_MEASUREMENT) { clock_gettime(CLOCK_REALTIME, &end); double seconds = (end.tv_sec - start.tv_sec) + (double) (end.tv_nsec - start.tv_nsec) / GHZ_CPS; printf("Reader thread %d: rate = %.2f M/s. Sum = %d\n", tid, num_iters / (1000000 * seconds), sum); num_iters = 0; clock_gettime(CLOCK_REALTIME, &start); } for(I = 0; I < BATCH_SIZE; I ++) { for(i = 0; i < COMPUTE; i ++) { node_id[I] = fastrand(&seed) & NUM_NODES_; } lock_id[I] = node_id[I] & NUM_LOCKS_; __builtin_prefetch(&locks[lock_id[I]], 0, 0); } for(I = 0; I < BATCH_SIZE; I ++) { pthread_spin_lock(&locks[lock_id[I]].lock); /** < Critical section begin */ nodes[node_id[I]].a ++; nodes[node_id[I]].b ++; /** < Critical section end */ pthread_spin_unlock(&locks[lock_id[I]].lock); num_iters ++; } } }
/* Walk the hardware receive descriptor ring starting at `index`.
 * Every descriptor whose status has bit 0 set is treated as ready: its
 * packet is handed to firehose_packet() and the descriptor's status is
 * cleared for reuse.  The walk stops at the first descriptor that is not
 * ready.
 * Returns the updated ring index (the first not-ready descriptor).
 * The index wraps with a mask of ring_size-1. */
int firehose_callback_v1(const char *pciaddr,
                         char **packets,
                         struct firehose_rdesc *rxring,
                         int ring_size,
                         int index)
{
  const int wrap_mask = ring_size - 1;

  for (;;) {
    int upcoming;

    if (!(rxring[index].status & 1)) {
      break;
    }

    /* Warm the cache for the following packet while this one is handled. */
    upcoming = (index + 1) & wrap_mask;
    __builtin_prefetch(packets[upcoming]);

    firehose_packet(pciaddr, packets[index], rxring[index].length);
    rxring[index].status = 0; /* reset descriptor for reuse */

    index = upcoming;
  }

  return index;
}
// tries to put the array in cache void array_cache_prefetch(array_container_t* B) { #ifdef IS_X64 const int32_t CACHELINESIZE = computecacheline(); // 64 bytes per cache line #else const int32_t CACHELINESIZE = 64; #endif #if !(defined(_MSC_VER) && !defined(__clang__)) for (int32_t k = 0; k < B->cardinality; k += CACHELINESIZE / (int32_t)sizeof(uint16_t)) { __builtin_prefetch(B->array + k); } #endif }
// Here we make loads with regular prefetch. The loop is unrolled // with factor 2 static double prefetchSumm(const double * data) { double res = 0; int interval = 32; for(int i = 0; i < ARR_SIZE; i+= unroll) { __builtin_prefetch(&data[i + interval], 0, 0); res += data[i] * A * B + C - D * E; res += data[i + 1] * A * B + C - D * E; } return res; }
//---- FCM: Finite Context Method Predictor unsigned char *fpfcmenc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t start) { uint64_t *ip, htab[1<<HBITS] = {0}, h = 0, _p[VSIZE], *p; unsigned char *op = out; #define FE64(i) { uint64_t u = ip[i]; p[i] = ENC64(u, htab[h]); htab[h] = u; h = HASH64(h,u); } for(ip = (uint64_t *)in; ip != in + (n&~(VSIZE-1)); ) { for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE64(0); FE64(1); FE64(2); FE64(3); } op = p4enc64(_p, VSIZE, op); __builtin_prefetch(ip+512, 0); } if(n = ((uint64_t *)in+n)-ip) { for(p = _p; p != &_p[n]; p++,ip++) FE64(0); op = p4enc64(_p, n, op); } return op; }
/*
 * Task body that walks a contiguous share of the linked lists in head[].
 *
 * `arg` carries this task's index (converted through intptr_t).  The
 * TOTAL_LISTS lists are divided among CORO_NUM tasks: each gets
 * TOTAL_LISTS/CORO_NUM lists and the first TOTAL_LISTS%CORO_NUM tasks get
 * one extra, giving the half-open range [mListIdx, nextListIdx).
 *
 * For every node of every list in that range:
 *   - when DATA_PREFETCH is defined, the node's data is prefetched with
 *     PREFETCH_MODE / PREFETCH_LOCALITY and yield() is called;
 *   - data[0 .. LOCAL_NUM-1] is added to accum, REPEAT_TIMES times over;
 *   - times is incremented.
 * At the end accum is added to total_accum and times to tra_times.
 * `me` is not referenced in the body.
 *
 * NOTE(review): the function is collapsed onto one physical line, so the
 * extent of each `//` comment is ambiguous.  In particular it cannot be
 * told from here whether `int64_t accum = 0;` is a live local declaration
 * or commented out (with accum declared elsewhere) -- confirm against the
 * original file before reformatting this block.
 */
void tracingTask(Worker *me, void *arg) { // TODO: arg parse int listsPerCoro = TOTAL_LISTS/CORO_NUM; int remainder = TOTAL_LISTS%CORO_NUM; intptr_t idx = (intptr_t)arg; int mListIdx = idx*listsPerCoro + (idx>=remainder ? remainder : idx); int nextListIdx = mListIdx + listsPerCoro + (idx>=remainder ? 0 : 1); List* localList; // int64_t accum = 0; int64_t times = 0; // TODO: tracing for (int j = mListIdx; j < nextListIdx; j++) { localList = head[j]; while (localList != NULL) { #ifdef DATA_PREFETCH //__builtin_prefetch(localList, PREFETCH_MODE, PREFETCH_LOCALITY); //yield(); __builtin_prefetch(localList->data, PREFETCH_MODE, PREFETCH_LOCALITY); yield(); #endif for (int i = 0; i < REPEAT_TIMES; i++) { for (int k = 0; k < LOCAL_NUM; k++) { accum += localList->data[k]; } /*accum += localList->data[0]; accum += localList->data[1]; accum += localList->data[2]; accum += localList->data[3]; accum += localList->data[4]; accum += localList->data[5]; accum += localList->data[6]; accum += localList->data[7]; accum += localList->data[8]; accum += localList->data[9]; accum += localList->data[10]; accum += localList->data[11]; accum += localList->data[12]; accum += localList->data[13]; */ } times++; localList = localList->next; } } total_accum += accum; tra_times += times; }
/* Compiler diagnostic test: every call below passes an invalid second or
   third argument to __builtin_prefetch on purpose (-1, an out-of-range
   literal, or the identifier `bogus`, which is declared elsewhere).  The
   trailing comment on each line is a test-harness annotation naming the
   warning that call must produce, so each annotation has to stay on the
   same line as its statement.  */
void bad (int *p)
{
  /* Invalid second argument.  */
  __builtin_prefetch (p, -1, 0); /* { dg-warning "invalid second arg to __builtin_prefetch; using zero" } */
  __builtin_prefetch (p, 2, 0); /* { dg-warning "invalid second arg to __builtin_prefetch; using zero" } */
  __builtin_prefetch (p, bogus, 0); /* { dg-warning "invalid second arg to __builtin_prefetch; using zero" } */

  /* Invalid third argument.  */
  __builtin_prefetch (p, 0, -1); /* { dg-warning "invalid third arg to __builtin_prefetch; using zero" } */
  __builtin_prefetch (p, 0, 4); /* { dg-warning "invalid third arg to __builtin_prefetch; using zero" } */
  __builtin_prefetch (p, 0, bogus); /* { dg-warning "invalid third arg to __builtin_prefetch; using zero" } */
}
/*
 * Prefetch the EtherType field of each of the `n` packets in `pq`.
 *
 * Packets shorter than a full 14-byte Ethernet header are skipped.
 *
 * Fix: the EtherType occupies bytes 12..13 of the header, so the prefetch
 * now targets offset 12.  It previously targeted offset 14, the first byte
 * after the header, which lies outside a 14-byte packet and can fall on a
 * different cache line than the EtherType.
 */
static void pkt_prefetch_etherhdr(struct pkt *pq, int n)
{
	enum {
		PKT_ETH_HDR_BYTES = 14,  /* dst MAC (6) + src MAC (6) + EtherType (2) */
		PKT_ETHERTYPE_OFF = 12   /* EtherType follows the two MAC addresses */
	};
	int i, len;
	const char *buf;

	for (i = 0; i < n; i++) {
		buf = pq[i].buf;
		len = pq[i].len;

		if (len < PKT_ETH_HDR_BYTES)
			continue;

		/* Pre-fetch the ethertype */
		__builtin_prefetch(&buf[PKT_ETHERTYPE_OFF]);
	}
}