/*
 * One SHA-1 round, run as thread `i` of a family.
 *
 * w          : global, the expanded message schedule; this thread reads w[i].
 * a..e       : shared working state, received from the previous thread and
 *              forwarded (rotated by one position) to the next one.
 *
 * Each shared is forwarded as soon as its new value is known, before the
 * (longer) computation of the new `a`.
 */
sl_def(sha_main_inner, void,
       sl_glparm(const uint32_t*restrict, w),
       sl_shparm(unsigned long, a),
       sl_shparm(unsigned long, b),
       sl_shparm(unsigned long, c),
       sl_shparm(unsigned long, d),
       sl_shparm(unsigned long, e))
{
    sl_index(i);

    /* Shift the state: e <- d, d <- c, c <- rol(b, 30), b <- a. */
    const uint32_t in_d = sl_getp(d);
    const uint32_t in_e = sl_getp(e);
    sl_setp(e, in_d);
    const uint32_t in_c = sl_getp(c);
    sl_setp(d, in_c);
    const uint32_t in_b = sl_getp(b);
    sl_setp(c, ROL32(in_b, 30));
    const uint32_t in_a = sl_getp(a);
    sl_setp(b, in_a);

    /* Round function and additive constant depend on the round number. */
    uint32_t mix;
    uint32_t konst;
    if (i < 20) {
        mix = (in_b & in_c) | ((~in_b) & in_d);
        konst = 0x5A827999L;
    } else if (i < 40) {
        mix = (in_b ^ in_c ^ in_d);
        konst = 0x6ED9EBA1L;
    } else if (i < 60) {
        mix = (in_b & in_c) | (in_b & in_d) | (in_c & in_d);
        konst = 0x8F1BBCDCL;
    } else {
        mix = (in_b ^ in_c ^ in_d);
        konst = 0xCA62C1D6L;
    }

    /* All additions wrap modulo 2^32 through the uint32_t result. */
    uint32_t next_a = ROL32(in_a, 5) + in_e + sl_getp(w)[i];
    next_a += mix;
    next_a += konst;
    sl_setp(a, next_a);
}
sl_enddef // partitions an interval using data[0] as the pivot // After all threads run, the value of "lower" reported to the parent // is the position where the pivot should stay. sl_def(do_partition_interval, void, sl_glparm(INT*, data), sl_glparm(INT*, result), sl_shparm(SIZE, lower), sl_shparm(SIZE, greater)) { sl_index(i); INT* d = sl_getp(data); if (d[i] < d[0]) { sl_setp(greater, sl_getp(greater)); SIZE l1 = sl_getp(lower); sl_setp(lower, l1+1); sl_getp(result)[l1] = d[i]; } else { sl_setp(lower, sl_getp(lower)); SIZE l2 = sl_getp(greater); sl_setp(greater, l2-1); sl_getp(result)[l2] = d[i]; } }
/*
 * Forwards the shared double `sarg` along the family: thread 0 passes the
 * received value on unchanged, every other thread passes it on plus one.
 */
sl_def(foo, void, sl_shfparm(double, sarg))
{
    sl_index(i);
    const double incoming = sl_getp(sarg);
    double outgoing = incoming;
    if (i != 0) {
        outgoing = incoming + 1;
    }
    sl_setp(sarg, outgoing);
}
/*
 * Computes one Fibonacci term per thread.
 *
 * prev, prev2 : shared, the two most recent terms received from the
 *               previous thread; forwarded shifted by one term.
 * fibo        : global output array; thread i stores its term in fibo[i].
 */
sl_def(fibo_compute, void,
       sl_shparm(INT, prev),
       sl_shparm(INT, prev2),
       sl_glparm(INT*, fibo))
{
    sl_index(i);
    INT last = sl_getp(prev);
    INT term = last + sl_getp(prev2);
    sl_setp(prev2, last);
    sl_setp(prev, term);
    sl_getp(fibo)[i] = term;
}
/*
 * Records this thread's index in indices[] at the position given by the
 * shared counter, then forwards the counter incremented by one.
 * Once the counter has reached `max`, the counter is forwarded unchanged
 * and sl_break is issued instead.
 */
sl_def(icount, void, sl_shparm(unsigned, count), sl_glparm(unsigned, max))
{
    sl_index(i);
    const unsigned seen = sl_getp(count);
    if (!(seen < sl_getp(max))) {
        /* pass the counter through so the value is still forwarded */
        sl_setp(count, seen);
        sl_break;
    }
    indices[seen] = i;
    sl_setp(count, seen + 1);
}
// done: used by a thread to signal it's right sibling when it finished sl_def(partition_interval, void, sl_glparm(INT*, data), sl_glparm(INT*, result), sl_glparm(SIZE, level), sl_shparm(int, done) ) { sl_index(i); INT* data = sl_getp(data); INT* result = sl_getp(result); SIZE level = sl_getp(level); int l = intervals[level%2][i].l; int r = intervals[level%2][i].r; if (l == r) { // this interval is sorted (1 element), so don't copy it to the // next level result[l] = data[l]; sl_setp(done, sl_getp(done)); } else { sl_create(,,1,r - l + 1,,,, do_partition_interval, sl_glarg(INT*, gdata, data + l), sl_glarg(INT*, gres, result + l), sl_sharg(SIZE, lower, 0), sl_sharg(SIZE, greater, r - l)); sl_sync(); SIZE la = sl_geta(lower); la = la + l; result[la] = data[l]; // put the pivot in the right place // copy the 2 new intervals to next level // but after the left sibling has done doing the same int left_done = sl_getp(done); workaround += left_done; // use the value, so the read doesn't get // optimized away. See comment for workaround. if (l < la) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = l; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = la; num_intervals[(level+1) % 2]++; } if (la + 1 < r) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = la + 1; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = r; num_intervals[(level+1) % 2]++; } //signal to the right sibling that I'm done sl_setp(done, 0); } }
sl_enddef

/* Further partitions a list of intervals.
 * data:    array being sorted; receives the partitioned values back
 * len:     number of elements copied back from scratch to data
 * scratch: temporary array the intervals are partitioned into
 * level:   level of the current list to be partitioned
 * done:    non-zero once no intervals were left for a next level
 */
sl_def(partition_list_of_intervals, void, sl_glparm(INT*, data), sl_glparm(SIZE, len), sl_glparm(INT*, scratch), sl_shparm(INT, level), sl_shparm(INT, done) ) {
    sl_index(i);
    INT done = sl_getp(done);
    if (!done) {
        INT* d = sl_getp(data);
        SIZE l = sl_getp(len);
        INT level = sl_getp(level);
        INT* scratch = sl_getp(scratch);
        //printf("PARTIITON LIST: level = %d\n", level);
        // start with an empty interval list for the next level
        num_intervals[(level+1)%2] = 0;
        // partition the intervals
        sl_create(,,0,num_intervals[level%2],,,,partition_interval, sl_glarg(INT*, gdata, d), sl_glarg(INT*, gres, scratch), sl_glarg(SIZE, level, level), sl_sharg(int, done, 1));
        sl_sync();
        // copy partitioned values back to d
        sl_create(,,0,l,,,, copy_array, sl_glarg(INT*, gdestination, d), sl_glarg(INT*, gsource, scratch));
        sl_sync();
        // NOTE(review): j is not used in the visible part of this function
        int j = 0;
        if (num_intervals[(level+1)%2] == 0) {
            // no intervals for the next level => we're done
            sl_setp(level, 0); // value doesn't matter, just unblock sibling
            sl_setp(done, 1);
        } else {
            // trigger the next sibling to start
            sl_setp(level, level+1);
            sl_setp(done, 0);
        }
    } else { // if (!done)
/*
 * Thread function with a large (90000-int) local array.
 * Forwards x + foo(array) - i, where i is this thread's index; only
 * element 42 of the array is initialised before it is handed to foo().
 */
sl_def(bar, void, sl_shparm(int, x))
{
    sl_index(i);
    int scratch[90000];

    scratch[42] = 123;
    sl_setp(x, sl_getp(x) + foo(scratch) - i);
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 3 // Inner Product //--------------------------------- //--------------------------------- // q = 0.0; // for ( k=0 ; k<n ; k++ ) // { // q += z[k]*x[k]; // } //--------------------------------- #ifndef NAIVE #define REDUCTIONS #endif sl_def(innerk3, void, sl_shfparm(double, Q), sl_glparm(const double*restrict, Z), sl_glparm(const double*restrict, X)) { sl_index(i); sl_setp(Q, (sl_getp(Z)[i] * sl_getp(X)[i]) + sl_getp(Q)); }
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 5 // tri-diagonal // elimination, below diagonal //--------------------------------- //--------------------------------- // for ( i=1 ; i<n ; i++ ) // { // x[i] = z[i]*( y[i] - x[i-1] ); // } //--------------------------------- sl_def(innerk5,void, sl_shfparm(double, prevx), sl_glparm(double*restrict, X), sl_glparm(const double*restrict, Y), sl_glparm(const double*restrict, Z)) { sl_index(i); double newx = sl_getp(Z)[i] * (sl_getp(Y)[i] - sl_getp(prevx)); sl_setp(prevx, newx); sl_getp(X)[i] = newx; }
sl_enddef #ifdef REDUCTIONS // method to perform a graph reduction of the above dependent kernel over CORES sl_def(reductionk3, void, sl_shfparm(double, Q), sl_glparm(const double*restrict, Z), sl_glparm(const double*restrict, X), sl_glparm(long, iternum)) { sl_index(redindex); long lower = sl_getp(iternum) * redindex; long upper = lower + sl_getp(iternum); sl_create(,PLACE_LOCAL, lower, upper, 1,,, innerk3, sl_shfarg(double, Qr, 0.0), sl_glarg(const double*, , sl_getp(Z)), sl_glarg(const double*, , sl_getp(X))); sl_sync(); //now accumilate the results sl_setp(Q, sl_geta(Qr) + sl_getp(Q) ); }
/*
 * Adds this thread's index to the running sum carried by the shared _s
 * and forwards the result.
 */
sl_def (thread, void, sl_shparm(int, _s))
{
    sl_index(x);
    int total = sl_getp(_s);

    total += x;
    sl_setp(_s, total);
}
/*
 * Prints the value received through the shared s with output_int(), then
 * forwards that value plus one.
 */
sl_def(synch_thread, void, sl_shparm(int, s))
{
    const int received = sl_getp(s);

    output_int(received, 1);
    sl_setp(s, received + 1);
}
/*
 * Prints this thread's index followed by a newline.
 * The shared token is read before anything else and forwarded unchanged
 * only after the output has been produced.
 */
sl_def(foo, void, sl_shparm(int, token))
{
    int received = sl_getp(token);

    (void)bar("");

    sl_index(i);
    output_int(i, 1);
    output_char('\n', 1);

    sl_setp(token, received);
}
sl_enddef

/*
 * Processes one input block, run as thread `offset_base` of a family.
 *
 * input    : global, base of the message words; this thread works on the
 *            words starting at input + offset_base.
 * h0..h4   : shared hash state; each is forwarded as its incoming value
 *            plus the matching result of the sha_main_inner family.
 */
sl_def(sha_main_outer, void, sl_glparm(const uint32_t*restrict, input), sl_shparm(unsigned long, h0), sl_shparm(unsigned long, h1), sl_shparm(unsigned long, h2), sl_shparm(unsigned long, h3), sl_shparm(unsigned long, h4)) {
    sl_index(offset_base);
    int i;
    const uint32_t*restrict input = sl_getp(input) + offset_base;
    /* word extension: not easily made concurrent! */
    uint32_t w[80];
    /* copy the first 16 words of the block into w (replaces the loop below) */
    sl_create(,PLACE_LOCAL,,16,,,, buf_copy, sl_glarg(const uint32_t*restrict, src, input), sl_glarg(uint32_t*restrict, dst, w));
    sl_sync();
    // for (i = 0; i < 16; ++i) w[i] = input[i];
    /* each remaining word depends on earlier words, hence the plain loop */
    for (i = 16; i < 80; ++i) {
        uint32_t x = w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16];
        w[i] = ROL32(x, 1);
    }
    /* run the 80 rounds; a..e are created without initial values ... */
    sl_create(,,,80,,,, sha_main_inner, sl_glarg(const uint32_t*restrict, wg, w), sl_sharg(unsigned long, a), sl_sharg(unsigned long, b), sl_sharg(unsigned long, c), sl_sharg(unsigned long, d), sl_sharg(unsigned long, e));
    /* ... and are set here, after the create, from the incoming hash state */
    sl_seta(a, sl_getp(h0));
    sl_seta(b, sl_getp(h1));
    sl_seta(c, sl_getp(h2));
    sl_seta(d, sl_getp(h3));
    sl_seta(e, sl_getp(h4));
    sl_sync();
    /* forward the updated hash state: incoming value + round result */
    sl_setp(h0, sl_getp(h0) + sl_geta(a));
    sl_setp(h1, sl_getp(h1) + sl_geta(b));
    sl_setp(h2, sl_getp(h2) + sl_geta(c));
    sl_setp(h3, sl_getp(h3) + sl_geta(d));
    sl_setp(h4, sl_getp(h4) + sl_geta(e));
}
/*
 * Prints one character. The thread index is reinterpreted as the address
 * of the character, which is loaded before the shared token is read; the
 * token is forwarded unchanged after the character has been written.
 */
sl_def(do_print, void, sl_shparm(long, tok))
{
    sl_index(i);
    const char ch = *(const char*)(void*)(long)i;
    const long token = sl_getp(tok);

    output_char(ch, 1);
    sl_setp(tok, token);
}
sl_enddef

/*
 * Prints one "<position> <indices[position]>" line, where position is the
 * value of the shared counter, then forwards the counter plus one.
 * Once the counter has reached refcount, the counter is forwarded
 * unchanged and sl_break is issued instead.
 *
 * NOTE(review): declared with return type int, unlike the sibling icount
 * which uses void; no value is returned -- confirm this is intended.
 */
sl_def(iprint, int, sl_shparm(unsigned, count), sl_glparm(unsigned, refcount))
{
    sl_index(i);
    const unsigned pos = sl_getp(count);

    if (!(pos < sl_getp(refcount))) {
        sl_setp(count, pos);
        sl_break;
    }

    output_int(pos, 1);
    output_char(' ', 1);
    output_int(indices[pos], 1);
    output_char('\n', 1);

    sl_setp(count, pos + 1);
}
sl_enddef

/*
 * One outer step of kernel 2.
 *
 * X, V   : global arrays handed through to innerk2.
 * ii     : shared; the received value bounds the inner family and is
 *          forwarded halved.
 * ipntp  : shared; the received value is passed to innerk2 as the old
 *          offset, and received ii + received ipntp is both passed to
 *          innerk2 as the new offset and forwarded to the next thread.
 */
sl_def(outerk2, void,
       sl_glparm(double*restrict, X),
       sl_glparm(const double*restrict, V),
       sl_shparm(unsigned long, ii),
       sl_shparm(unsigned long, ipntp))
{
    sl_index(m);
    const unsigned long span = sl_getp(ii);
    const unsigned long old_base = sl_getp(ipntp);
    const unsigned long new_base = span + old_base;

    /* forward ii before starting the inner family */
    sl_setp(ii, span / 2);

    sl_create(,,1,span,2,,, innerk2,
              sl_glarg(double*restrict, , sl_getp(X)),
              sl_glarg(const double*restrict, , sl_getp(V)),
              sl_glarg(unsigned long, , old_base),
              sl_glarg(unsigned long, , new_base));
    sl_sync();

    sl_setp(ipntp, new_base);
}
sl_enddef

/*
 * Prints term i of the fibo array together with the two terms before it.
 * fibo[i-2] and fibo[i-1] are read, so the index must be at least 2.
 * The shared guard is read after the array loads and forwarded unchanged
 * after the line has been printed.
 */
sl_def(fibo_print, void, sl_shparm(INT, guard), sl_glparm(INT*, fibo))
{
    sl_index(i);
    INT first = sl_getp(fibo)[i - 2];
    INT second = sl_getp(fibo)[i - 1];
    INT sum = sl_getp(fibo)[i];
    INT token = sl_getp(guard);

    printf("The %luth Fibonacci number is %lu + %lu = %lu\n",
           (INT)i, first, second, sum);

    sl_setp(guard, token);
}
sl_enddef

/** GOL **/
/* One generation step, run as thread `index` of a family.
 * breaked: shared flag; a thread receiving 0 runs the three phases below
 *          and forwards 1 (followed by sl_break) when index + 1 == cycle,
 *          otherwise it forwards 0.
 */
sl_def(gol,void,sl_shparm(int,breaked)) {
    sl_index(index);
    int flag = sl_getp(breaked);
    if(flag==0) {
        //info_print("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements);
        //printf("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements);
        //info_print("%d\n",iter);
        /* this is the last requested cycle: remember to stop afterwards */
        if( index + 1 == cycle) flag = 1;
        /* phase 1: run the worker family */
        debug_print("Creating Worker family...\n");
        sl_create(,,,,0,block_size,,run,sl_glarg(int,iteration,index),sl_sharg(int,sta,0));
        debug_print("Waiting for sync...\n");
        sl_sync();
        /* phase 2: process the request queue */
        debug_print("Workers finished, processing request queue...\n");
        sl_create(,,,,0,block_size,,process_requests,sl_sharg(int,stat,0));
        debug_print("Waiting for sync...\n");
        sl_sync();
        /* phase 3: reset the block queue count, then traverse the table */
        debug_print("Request queue processing finished, traversing...\n");
        b_queue->elements = 0;
        sl_create(,,,,0,block_size,,traverse,sl_sharg(int,state,1),sl_sharg(struct hashtable_itr*,itr,hashtable_iterator(table)));
        debug_print("Waiting for sync...\n");
        sl_sync();
        /* forward the flag before breaking */
        sl_setp(breaked,flag);
        if(flag == 1) sl_break;
    } else {
/* Forwards the shared a incremented by one. */
sl_def(foo, void, sl_shparm(int, a))
{
    const int received = sl_getp(a);

    sl_setp(a, received + 1);
}
/* Forwards the shared x incremented by one. */
sl_def(bar, void, sl_shparm(int, x))
{
    const int received = sl_getp(x);

    sl_setp(x, received + 1);
}
/*
 * Stores the value received through the shared x in the global bla and
 * adds it to the running sum carried by the shared v.
 *
 * x is forwarded unchanged. The original body read x but never wrote it,
 * so no value for x was passed on; the other thread functions in this
 * file forward every shared they receive.
 */
sl_def(a, void, sl_shparm(int, x), sl_shparm(int, v))
{
    const int received = sl_getp(x);

    bla = received;
    /* pass x through untouched */
    sl_setp(x, received);
    sl_setp(v, sl_getp(v) + bla);
}
/*
 * Forwards the shared a incremented by one, then prints a '.'.
 * The character is written after the shared has been forwarded.
 */
sl_def(foo, void, sl_shparm(int, a))
{
    const int received = sl_getp(a);

    sl_setp(a, received + 1);
    putchar('.');
}
/* Forwards the shared double a incremented by 1.0. */
sl_def(foo, void, sl_shfparm(double, a))
{
    const double received = sl_getp(a);

    sl_setp(a, received + 1.0);
}
/* Forwards the shared x unchanged. */
sl_def(foo, void, sl_shparm(int, x))
{
    const int received = sl_getp(x);

    sl_setp(x, received);
}