sl_enddef

// Entry point of Livermore kernel 21 (matrix * matrix product).
// Checks that the fixed dimensions are 25, then creates one family of
// threads running the cell thread function and waits for it to finish.
// ncores and the *_dim0 sizes of CX and PX are accepted but not read here.
sl_def(kernel21, void,
       sl_glparm(size_t, ncores),
       sl_glparm(size_t, n),
       sl_glparm(const double*restrict, CX),
       sl_glparm(size_t, CX_dim0),
       sl_glparm(size_t, CX_dim1),
       sl_glparm(double*restrict, PX),
       sl_glparm(size_t, PX_dim0),
       sl_glparm(size_t, PX_dim1),
       sl_glparm(const double*restrict, VY),
       sl_glparm(size_t, VY_dim0),
       sl_glparm(size_t, VY_dim1))
{
    // the cell thread hard-codes a width of 25 for these dimensions
    assert(sl_getp(PX_dim1) == 25);
    assert(sl_getp(CX_dim1) == 25);
    assert(sl_getp(VY_dim0) == 25);
    assert(sl_getp(VY_dim1) == 25);

    // one thread index per output cell: n rows of 25 columns each
    const size_t rows = sl_getp(n);
    const size_t cells = 25 * rows;

    // create the family of the appropriate size
    sl_create(,, 0, cells, 1, 0,, cell,
              sl_glarg(size_t, , rows),
              sl_glarg(const double*, , sl_getp(VY)),
              sl_glarg(const double*, , sl_getp(CX)),
              sl_glarg(double*, , sl_getp(PX)));
    sl_sync();
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 3 // Inner Product //--------------------------------- //--------------------------------- // q = 0.0; // for ( k=0 ; k<n ; k++ ) // { // q += z[k]*x[k]; // } //--------------------------------- #ifndef NAIVE #define REDUCTIONS #endif sl_def(innerk3, void, sl_shfparm(double, Q), sl_glparm(const double*restrict, Z), sl_glparm(const double*restrict, X)) { sl_index(i); sl_setp(Q, (sl_getp(Z)[i] * sl_getp(X)[i]) + sl_getp(Q)); }
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 5 // tri-diagonal // elimination, below diagonal //--------------------------------- //--------------------------------- // for ( i=1 ; i<n ; i++ ) // { // x[i] = z[i]*( y[i] - x[i-1] ); // } //--------------------------------- sl_def(innerk5,void, sl_shfparm(double, prevx), sl_glparm(double*restrict, X), sl_glparm(const double*restrict, Y), sl_glparm(const double*restrict, Z)) { sl_index(i); double newx = sl_getp(Z)[i] * (sl_getp(Y)[i] - sl_getp(prevx)); sl_setp(prevx, newx); sl_getp(X)[i] = newx; }
// Copies element i of source into element i of destination,
// where i is the index of the current thread.
sl_def(copy_array, void,
       sl_glparm(INT*, destination),
       sl_glparm(INT*, source))
{
    sl_index(i);

    INT *to = sl_getp(destination);
    INT *from = sl_getp(source);

    to[i] = from[i];
}
sl_enddef #ifdef REDUCTIONS // method to perform a graph reduction of the above dependent kernel over CORES sl_def(reductionk3, void, sl_shfparm(double, Q), sl_glparm(const double*restrict, Z), sl_glparm(const double*restrict, X), sl_glparm(long, iternum)) { sl_index(redindex); long lower = sl_getp(iternum) * redindex; long upper = lower + sl_getp(iternum); sl_create(,PLACE_LOCAL, lower, upper, 1,,, innerk3, sl_shfarg(double, Qr, 0.0), sl_glarg(const double*, , sl_getp(Z)), sl_glarg(const double*, , sl_getp(X))); sl_sync(); //now accumilate the results sl_setp(Q, sl_geta(Qr) + sl_getp(Q) ); }
sl_enddef

// Copies the 32-bit word at position i of src to position i of dst,
// where i is the index of the current thread.
sl_def(buf_copy, void,
       sl_glparm(const uint32_t*restrict, src),
       sl_glparm(uint32_t*restrict, dst))
{
    sl_index(i);

    const uint32_t *from = sl_getp(src);
    uint32_t *to = sl_getp(dst);

    to[i] = from[i];
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 21 // Matrix*Matrix Product //--------------------------------- //--------------------------------- // 'Original' C: // for ( k=0 ; k<25 ; k++ ) // { // for ( i=0 ; i<25 ; i++ ) // { // for ( j=0 ; j<n ; j++ ) // { // px[j][i] += vy[k][i] * cx[j][k]; // } // } // } //--------------------------------- //cell to calculate is computed by taking the remainder //and result of dividing the thread index by the matrix //width. sl_def(cell, void, sl_glparm(size_t, n), sl_glparm(const double*restrict, VY), sl_glparm(const double*restrict, CX), sl_glparm(double*restrict, PX)) { sl_index(ij); const size_t n = sl_getp(n); double (*restrict PX)[n][25] = (double (*)[n][25])(double*)sl_getp(PX); const double (*restrict CX)[n][25] = (const double (*)[n][25])(const double*)sl_getp(CX); const double (*restrict VY)[25][25] = (const double (*)[25][25])(const double*)sl_getp(VY); long i = ij % 25; long j = ij / 25; //N.B. can easily make the following into a new family, //but requires a reduction over the number of cores long k; double px_ij = (*PX)[j][i]; for (k = 0; k < 25; ++k) px_ij += (*VY)[k][i] * (*CX)[j][k]; //save result (*PX)[j][i] = px_ij; }
// One Fibonacci step: receives the two previous numbers through the
// shared parameters prev and prev2, passes on the shifted pair and
// stores the new number in fibo[i].
sl_def(fibo_compute, void,
       sl_shparm(INT, prev),
       sl_shparm(INT, prev2),
       sl_glparm(INT*, fibo))
{
    sl_index(i);

    const INT last = sl_getp(prev);
    const INT before_last = sl_getp(prev2);
    const INT next = last + before_last;

    sl_setp(prev2, last);
    sl_setp(prev, next);

    sl_getp(fibo)[i] = next;
}
// Records the index of the current thread in the indices array at the
// position given by the shared counter, then passes on the counter plus one.
// Once the counter has reached max, the counter is passed on unchanged and
// the break statement is issued instead.
sl_def(icount, void,
       sl_shparm(unsigned, count),
       sl_glparm(unsigned, max))
{
    sl_index(i);

    const unsigned seen = sl_getp(count);

    if (seen >= sl_getp(max)) {
        // forward the value so the shared parameter is still written
        sl_setp(count, seen);
        sl_break ;
    }

    indices[seen] = i;
    sl_setp(count, seen + 1);
}
sl_enddef /* Further partitions a list of intervals * level: level of the current list to be partitioned */ sl_def(partition_list_of_intervals, void, sl_glparm(INT*, data), sl_glparm(SIZE, len), sl_glparm(INT*, scratch), sl_shparm(INT, level), sl_shparm(INT, done) ) { sl_index(i); INT done = sl_getp(done); if (!done) { INT* d = sl_getp(data); SIZE l = sl_getp(len); INT level = sl_getp(level); INT* scratch = sl_getp(scratch); //printf("PARTIITON LIST: level = %d\n", level); num_intervals[(level+1)%2] = 0; // partition the intervals sl_create(,,0,num_intervals[level%2],,,,partition_interval, sl_glarg(INT*, gdata, d), sl_glarg(INT*, gres, scratch), sl_glarg(SIZE, level, level), sl_sharg(int, done, 1)); sl_sync(); // copy partitoned values back to d sl_create(,,0,l,,,, copy_array, sl_glarg(INT*, gdestination, d), sl_glarg(INT*, gsource, scratch)); sl_sync(); int j = 0; if (num_intervals[(level+1)%2] == 0) { // no intervals for the next level => we're done sl_setp(level, 0); // value doesn't matter, just unblock sibling sl_setp(done, 1); } else { // trigger the next sibling to start sl_setp(level, level+1); sl_setp(done, 0); } } else { // if (!done)
// done: used by a thread to signal it's right sibling when it finished sl_def(partition_interval, void, sl_glparm(INT*, data), sl_glparm(INT*, result), sl_glparm(SIZE, level), sl_shparm(int, done) ) { sl_index(i); INT* data = sl_getp(data); INT* result = sl_getp(result); SIZE level = sl_getp(level); int l = intervals[level%2][i].l; int r = intervals[level%2][i].r; if (l == r) { // this interval is sorted (1 element), so don't copy it to the // next level result[l] = data[l]; sl_setp(done, sl_getp(done)); } else { sl_create(,,1,r - l + 1,,,, do_partition_interval, sl_glarg(INT*, gdata, data + l), sl_glarg(INT*, gres, result + l), sl_sharg(SIZE, lower, 0), sl_sharg(SIZE, greater, r - l)); sl_sync(); SIZE la = sl_geta(lower); la = la + l; result[la] = data[l]; // put the pivot in the right place // copy the 2 new intervals to next level // but after the left sibling has done doing the same int left_done = sl_getp(done); workaround += left_done; // use the value, so the read doesn't get // optimized away. See comment for workaround. if (l < la) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = l; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = la; num_intervals[(level+1) % 2]++; } if (la + 1 < r) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = la + 1; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = r; num_intervals[(level+1) % 2]++; } //signal to the right sibling that I'm done sl_setp(done, 0); } }
sl_enddef

// Prints one line showing fibo[i] as the sum of fibo[i-2] and fibo[i-1].
// The shared parameter guard is read before printing and passed on
// unchanged afterwards.
sl_def(fibo_print, void,
       sl_shparm(INT, guard),
       sl_glparm(INT*, fibo))
{
    sl_index(i);

    INT *seq = sl_getp(fibo);
    const INT first = seq[i - 2];
    const INT second = seq[i - 1];
    const INT sum = seq[i];

    const INT token = sl_getp(guard);
    printf("The %luth Fibonacci number is %lu + %lu = %lu\n", (INT)i, first, second, sum);
    sl_setp(guard, token);
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //---------------------------------------- // LIVERMORE KERNEL 2 // Incomplete Cholesky Conjugate Gradient //---------------------------------------- //--------------------------------- // ii = n; // ipntp = 0; // do // { // ipnt = ipntp; // ipntp += ii; // ii /= 2; // i = ipntp; // for ( k=ipnt+1 ; k<ipntp ; k=k+2 ) // { // i++; // x[i] = x[k] - v[k]*x[k-1] - v[k+1]*x[k+1]; // } // } while ( ii>0 ); //--------------------------------- sl_def(innerk2,void, sl_glparm(double*restrict, X), sl_glparm(const double*restrict, V), sl_glparm(unsigned long, ipnt), sl_glparm(unsigned long, ipntp)) { sl_index(i); unsigned long ipnt = sl_getp(ipnt); unsigned long ipntp = sl_getp(ipntp); unsigned long k = ipnt + i; double*restrict X = sl_getp(X); const double*restrict V = sl_getp(V); // output_uint(k,2); output_char('\n',2); // output_int(ipntp + i / 2, 2); output_char('\n',2); X[ipntp + i / 2] = X[k] - V[k] * X[k-1] - V[k+1] * X[k+1]; }
// Declares a 90000-element local array, sets one element, and passes on
// the incoming shared value plus the result of foo on that array minus
// the thread index.
sl_def(bar, void, sl_shparm(int, x))
{
    sl_index(i);

    int scratch[90000];
    scratch[42] = 123;

    sl_setp(x, sl_getp(x) + foo(scratch) - i);
}
sl_def(foo,void,sl_glparm(int, x)) { if (sl_getp(x) > 0) { sl_create(,,,,,,,foo,sl_glarg(int,,sl_getp(x)-1)); sl_sync(); } else {
// Reads the shared value s, prints it with output_int, and passes on s + 1.
sl_def(synch_thread, void, sl_shparm(int, s))
{
    const int received = sl_getp(s);

    output_int(received, 1);
    sl_setp(s, received + 1);
}
// Waits for the shared token, calls bar with an empty string, prints the
// thread index followed by a newline, then passes the token on unchanged.
sl_def(foo, void, sl_shparm(int, token))
{
    const int held = sl_getp(token);

    (void)bar("");

    sl_index(i);
    output_int(i, 1);
    output_char('\n', 1);

    sl_setp(token, held);
}
// Adds the thread index to the value received through the shared
// parameter and passes the sum on.
sl_def (thread, void, sl_shparm(int, _s))
{
    sl_index(x);

    const int received = sl_getp(_s);
    const int total = x + received;

    sl_setp(_s, total);
}
sl_def(foo,void,sl_glparm(int, x)) { if (sl_getp(x) > 0) { sl_spawndecl(f); sl_spawn(f,,,,,,,foo,sl_glarg(int,,sl_getp(x)-1)); sl_spawnsync(f); } else {
sl_enddef // partitions an interval using data[0] as the pivot // After all threads run, the value of "lower" reported to the parent // is the position where the pivot should stay. sl_def(do_partition_interval, void, sl_glparm(INT*, data), sl_glparm(INT*, result), sl_shparm(SIZE, lower), sl_shparm(SIZE, greater)) { sl_index(i); INT* d = sl_getp(data); if (d[i] < d[0]) { sl_setp(greater, sl_getp(greater)); SIZE l1 = sl_getp(lower); sl_setp(lower, l1+1); sl_getp(result)[l1] = d[i]; } else { sl_setp(lower, sl_getp(lower)); SIZE l2 = sl_getp(greater); sl_setp(greater, l2-1); sl_getp(result)[l2] = d[i]; } }
// Treats the thread index as the address of a character, reads that
// character, and prints it once the shared token tok has arrived.
// The token is passed on unchanged.
sl_def(do_print, void, sl_shparm(long, tok))
{
    sl_index(i);

    // the character is fetched before waiting for the token
    const char ch = *(const char*)(void*)(long)i;

    const long token = sl_getp(tok);
    output_char(ch, 1);
    sl_setp(tok, token);
}
// Passes the shared floating-point value on unchanged in the thread with
// index 0, and incremented by one in every other thread.
sl_def(foo, void, sl_shfparm(double, sarg))
{
    sl_index(i);

    const double received = sl_getp(sarg);
    const double forwarded = received;

    if (i != 0) {
        sl_setp(sarg, forwarded+1);
    } else {
        sl_setp(sarg, forwarded);
    }
}
sl_enddef

// Prints the shared counter value and the entry of the indices array at
// that position, separated by a space and followed by a newline, then
// passes on the counter plus one. Once the counter has reached refcount,
// the counter is passed on unchanged and the break statement is issued.
sl_def(iprint, int,
       sl_shparm(unsigned, count),
       sl_glparm(unsigned, refcount))
{
    sl_index(i);

    const unsigned pos = sl_getp(count);

    if (pos >= sl_getp(refcount)) {
        // forward the value so the shared parameter is still written
        sl_setp(count, pos);
        sl_break ;
    }

    output_int(pos, 1);
    output_char(' ', 1);
    output_int(indices[pos], 1);
    output_char('\n', 1);

    sl_setp(count, pos + 1);
}
sl_enddef

// One pass of the outer loop of kernel 2. Receives the current length ii
// and the current offset ipntp through shared parameters, passes on half
// the length immediately, runs the inner family over the current length,
// and finally passes on the advanced offset (old offset plus old length).
sl_def(outerk2, void,
       sl_glparm(double*restrict, X),
       sl_glparm(const double*restrict, V),
       sl_shparm(unsigned long, ii),
       sl_shparm(unsigned long, ipntp))
{
    sl_index(m);

    const unsigned long length = sl_getp(ii);
    const unsigned long offset = sl_getp(ipntp);
    const unsigned long next_offset = length + offset;

    // the halved length can be handed on before the inner work starts
    sl_setp(ii, length/2);

    sl_create(,,1,length,2,,, innerk2,
              sl_glarg(double*restrict, , sl_getp(X)),
              sl_glarg(const double*restrict, , sl_getp(V)),
              sl_glarg(unsigned long, , offset),
              sl_glarg(unsigned long, , next_offset));
    sl_sync();

    // the offset is only handed on after the inner family has finished
    sl_setp(ipntp, next_offset);
}
// One round of the five-word state update. The state words a..e travel
// through shared parameters: each word is read and the shifted value for
// the next thread is written as soon as it is known. The new value of a
// combines the rotated old a, old e, the message word w[i], and a mixing
// function and additive constant selected by the range of i
// (below 20, below 40, below 60, otherwise).
sl_def(sha_main_inner, void,
       sl_glparm(const uint32_t*restrict, w),
       sl_shparm(unsigned long, a),
       sl_shparm(unsigned long, b),
       sl_shparm(unsigned long, c),
       sl_shparm(unsigned long, d),
       sl_shparm(unsigned long, e))
{
    sl_index(i);

    // read the state from the back and forward each shifted word at once
    uint32_t vd = sl_getp(d);
    uint32_t ve = sl_getp(e);
    sl_setp(e, vd);

    uint32_t vc = sl_getp(c);
    sl_setp(d, vc);

    uint32_t vb = sl_getp(b);
    sl_setp(c, ROL32(vb, 30));

    uint32_t va = sl_getp(a);
    sl_setp(b, va);

    uint32_t acc = ROL32(va, 5) + ve + sl_getp(w)[i];

    // pick the mixing function and constant for this round
    uint32_t mix;
    uint32_t round_const;
    if (i < 20) {
        mix = (vb & vc) | ((~vb) & vd);
        round_const = 0x5A827999L;
    } else if (i < 40) {
        mix = (vb ^ vc ^ vd);
        round_const = 0x6ED9EBA1L;
    } else if (i < 60) {
        mix = (vb & vc) | (vb & vd) | (vc & vd);
        round_const = 0x8F1BBCDCL;
    } else {
        mix = (vb ^ vc ^ vd);
        round_const = 0xCA62C1D6L;
    }

    acc += mix;
    acc += round_const;

    sl_setp(a, acc);
}
// Prints x using the entries of roman_table: a leading minus sign for
// negative values, then, for each table entry in order, the entry's
// representation repeated while the remaining value is at least the
// entry's base. The table walk stops at the first entry whose base is zero.
sl_def(roman, void, sl_glparm(short, x))
{
    long remaining = sl_getp(x);

    if (unlikely(remaining < 0)) {
        output_char('-', 1);
        remaining = -remaining;
    }

    for (struct roman_table_t *entry = roman_table; entry->base; ++entry) {
        while (likely(remaining >= entry->base)) {
            for (const char *ch = entry->repr; *ch; ++ch) {
                output_char(*ch, 1);
            }
            remaining = remaining - entry->base;
        }
    }
}
sl_enddef /** GOL **/ sl_def(gol,void,sl_shparm(int,breaked)) { sl_index(index); int flag = sl_getp(breaked); if(flag==0) { //info_print("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements); //printf("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements); //info_print("%d\n",iter); if( index + 1 == cycle) flag = 1; debug_print("Creating Worker family...\n"); sl_create(,,,,0,block_size,,run,sl_glarg(int,iteration,index),sl_sharg(int,sta,0)); debug_print("Waiting for sync...\n"); sl_sync(); debug_print("Workers finished, processing request queue...\n"); sl_create(,,,,0,block_size,,process_requests,sl_sharg(int,stat,0)); debug_print("Waiting for sync...\n"); sl_sync(); debug_print("Request queue processing finished, traversing...\n"); b_queue->elements = 0; sl_create(,,,,0,block_size,,traverse,sl_sharg(int,state,1),sl_sharg(struct hashtable_itr*,itr,hashtable_iterator(table))); debug_print("Waiting for sync...\n"); sl_sync(); sl_setp(breaked,flag); if(flag == 1) sl_break; } else {
sl_enddef #include <math.h> sl_def(kernel2, void, sl_glparm(size_t, ncores), sl_glparm(size_t, n), sl_glparm(const double*restrict, V), sl_glparm(size_t, V_dim), sl_glparm(double*restrict, X), sl_glparm(size_t, X_dim)) { // output_int(sl_getp(n), 2); output_char('\n', 2); unsigned long upper = log2(sl_getp(n)); sl_create(,,upper,-1,-1,2,, outerk2, sl_glarg(double*restrict, , sl_getp(X)), sl_glarg(const double*restrict, , sl_getp(V)), sl_sharg(unsigned long, ii, sl_getp(n)), sl_sharg(unsigned long, ipntp, 0)); sl_sync(); }
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 1 // Hydro Fragment //--------------------------------- //--------------------------------- // 'Original' C: // for ( k=0 ; k<n ; k++ ) { // x[k] = q + y[k]*( r*z[k+10] + t*z[k+11] ); // } //--------------------------------- //Break down kernel into two families //this one does the 'meat' sl_def(innerk1, void, sl_glparm(double*restrict, X), sl_glfparm(double, Q), sl_glparm(const double*restrict, Y), sl_glfparm(double, R), sl_glparm(const double*restrict, ZX), sl_glfparm(double, T) ) { sl_index(i); //now the actual calculation sl_getp(X)[i] = sl_getp(Q) + sl_getp(Y)[i] * ( sl_getp(R) * sl_getp(ZX)[i+10] + sl_getp(T) * sl_getp(ZX)[i+11] ); }
// Passes on the value received through the shared parameter a, plus one.
sl_def(foo, void, sl_shparm(int, a))
{
    sl_setp(a, 1 + sl_getp(a));
}