/*
 * do_print: print a single character.
 *
 * The thread index is reinterpreted as the address of the character to
 * print. The character is loaded first, then the shared token is read,
 * the character is written with output_char (second argument 1), and the
 * same token value is passed on unchanged.
 */
sl_def(do_print, void, sl_shparm(long, tok))
{
    sl_index(i);

    /* the index value itself is used as the address of the character */
    const char *where = (const char*)(void*)(long)i;
    char ch = *where;

    /* take the token, print, then hand the same token on */
    long token = sl_getp(tok);
    output_char(ch, 1);
    sl_setp(tok, token);
}
sl_enddef

/*
 * buf_copy: element-wise copy of 32-bit words.
 *
 * Each thread copies the single word found at its own index from the
 * source buffer to the same position in the destination buffer.
 */
sl_def(buf_copy, void,
       sl_glparm(const uint32_t*restrict, src),
       sl_glparm(uint32_t*restrict, dst))
{
    sl_index(i);

    const uint32_t*restrict from = sl_getp(src);
    uint32_t*restrict to = sl_getp(dst);

    to[i] = from[i];
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 21 // Matrix*Matrix Product //--------------------------------- //--------------------------------- // 'Original' C: // for ( k=0 ; k<25 ; k++ ) // { // for ( i=0 ; i<25 ; i++ ) // { // for ( j=0 ; j<n ; j++ ) // { // px[j][i] += vy[k][i] * cx[j][k]; // } // } // } //--------------------------------- //cell to calculate is computed by taking the remainder //and result of dividing the thread index by the matrix //width. sl_def(cell, void, sl_glparm(size_t, n), sl_glparm(const double*restrict, VY), sl_glparm(const double*restrict, CX), sl_glparm(double*restrict, PX)) { sl_index(ij); const size_t n = sl_getp(n); double (*restrict PX)[n][25] = (double (*)[n][25])(double*)sl_getp(PX); const double (*restrict CX)[n][25] = (const double (*)[n][25])(const double*)sl_getp(CX); const double (*restrict VY)[25][25] = (const double (*)[25][25])(const double*)sl_getp(VY); long i = ij % 25; long j = ij / 25; //N.B. can easily make the following into a new family, //but requires a reduction over the number of cores long k; double px_ij = (*PX)[j][i]; for (k = 0; k < 25; ++k) px_ij += (*VY)[k][i] * (*CX)[j][k]; //save result (*PX)[j][i] = px_ij; }
/*
 * foo: pass a shared floating-point value along the thread chain.
 *
 * The thread with index 0 forwards the value it received unchanged;
 * every other thread forwards the received value plus one.
 */
sl_def(foo, void, sl_shfparm(double, sarg))
{
    sl_index(i);

    double received = sl_getp(sarg);
    double forwarded = (i == 0) ? received : received + 1;

    sl_setp(sarg, forwarded);
}
/*
 * fibo_compute: one step of the Fibonacci recurrence.
 *
 * Adds the two values received through the shared parameters, stores the
 * sum at this thread's index in the output array, and passes on the pair
 * (sum, previous value) for the next step.
 */
sl_def(fibo_compute, void,
       sl_shparm(INT, prev),
       sl_shparm(INT, prev2),
       sl_glparm(INT*, fibo))
{
    sl_index(i);

    INT last = sl_getp(prev);
    INT sum = last + sl_getp(prev2);

    /* shift the window: the old "previous" becomes "previous of previous" */
    sl_setp(prev2, last);
    sl_setp(prev, sum);

    sl_getp(fibo)[i] = sum;
}
/*
 * icount: record thread indices until a maximum count is reached.
 *
 * While the running count is below max, the thread stores its own index
 * in indices[count] and passes on count + 1. Once the count has reached
 * max, the count is passed on unchanged and a break is issued.
 */
sl_def(icount, void, sl_shparm(unsigned, count), sl_glparm(unsigned, max))
{
    sl_index(i);

    unsigned seen = sl_getp(count);

    if (seen >= sl_getp(max)) {
        /* forward the count untouched before breaking */
        sl_setp(count, seen);
        sl_break ;
    }

    indices[seen] = i;
    sl_setp(count, seen + 1);
}
// done: used by a thread to signal it's right sibling when it finished sl_def(partition_interval, void, sl_glparm(INT*, data), sl_glparm(INT*, result), sl_glparm(SIZE, level), sl_shparm(int, done) ) { sl_index(i); INT* data = sl_getp(data); INT* result = sl_getp(result); SIZE level = sl_getp(level); int l = intervals[level%2][i].l; int r = intervals[level%2][i].r; if (l == r) { // this interval is sorted (1 element), so don't copy it to the // next level result[l] = data[l]; sl_setp(done, sl_getp(done)); } else { sl_create(,,1,r - l + 1,,,, do_partition_interval, sl_glarg(INT*, gdata, data + l), sl_glarg(INT*, gres, result + l), sl_sharg(SIZE, lower, 0), sl_sharg(SIZE, greater, r - l)); sl_sync(); SIZE la = sl_geta(lower); la = la + l; result[la] = data[l]; // put the pivot in the right place // copy the 2 new intervals to next level // but after the left sibling has done doing the same int left_done = sl_getp(done); workaround += left_done; // use the value, so the read doesn't get // optimized away. See comment for workaround. if (l < la) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = l; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = la; num_intervals[(level+1) % 2]++; } if (la + 1 < r) { // don't copy an interval of len=1 intervals[(level+1)%2][num_intervals[(level+1) % 2]].l = la + 1; intervals[(level+1)%2][num_intervals[(level+1) % 2]].r = r; num_intervals[(level+1) % 2]++; } //signal to the right sibling that I'm done sl_setp(done, 0); } }
sl_enddef /* Further partitions a list of intervals * level: level of the current list to be partitioned */ sl_def(partition_list_of_intervals, void, sl_glparm(INT*, data), sl_glparm(SIZE, len), sl_glparm(INT*, scratch), sl_shparm(INT, level), sl_shparm(INT, done) ) { sl_index(i); INT done = sl_getp(done); if (!done) { INT* d = sl_getp(data); SIZE l = sl_getp(len); INT level = sl_getp(level); INT* scratch = sl_getp(scratch); //printf("PARTIITON LIST: level = %d\n", level); num_intervals[(level+1)%2] = 0; // partition the intervals sl_create(,,0,num_intervals[level%2],,,,partition_interval, sl_glarg(INT*, gdata, d), sl_glarg(INT*, gres, scratch), sl_glarg(SIZE, level, level), sl_sharg(int, done, 1)); sl_sync(); // copy partitoned values back to d sl_create(,,0,l,,,, copy_array, sl_glarg(INT*, gdestination, d), sl_glarg(INT*, gsource, scratch)); sl_sync(); int j = 0; if (num_intervals[(level+1)%2] == 0) { // no intervals for the next level => we're done sl_setp(level, 0); // value doesn't matter, just unblock sibling sl_setp(done, 1); } else { // trigger the next sibling to start sl_setp(level, level+1); sl_setp(done, 0); } } else { // if (!done)
sl_enddef

/*
 * fibo_print: print one line of the Fibonacci table.
 *
 * Loads the entry at this thread's index together with the two entries
 * before it, then reads the shared guard, prints the line, and passes
 * the guard value on unchanged.
 */
sl_def(fibo_print, void, sl_shparm(INT, guard), sl_glparm(INT*, fibo))
{
    sl_index(i);

    INT* series = sl_getp(fibo);
    INT older = series[i - 2];
    INT newer = series[i - 1];
    INT current = series[i];

    /* the guard is read only after the three loads above */
    INT token = sl_getp(guard);
    printf("The %luth Fibonacci number is %lu + %lu = %lu\n", (INT)i, older, newer, current);
    sl_setp(guard, token);
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //---------------------------------------- // LIVERMORE KERNEL 2 // Incomplete Cholesky Conjugate Gradient //---------------------------------------- //--------------------------------- // ii = n; // ipntp = 0; // do // { // ipnt = ipntp; // ipntp += ii; // ii /= 2; // i = ipntp; // for ( k=ipnt+1 ; k<ipntp ; k=k+2 ) // { // i++; // x[i] = x[k] - v[k]*x[k-1] - v[k+1]*x[k+1]; // } // } while ( ii>0 ); //--------------------------------- sl_def(innerk2,void, sl_glparm(double*restrict, X), sl_glparm(const double*restrict, V), sl_glparm(unsigned long, ipnt), sl_glparm(unsigned long, ipntp)) { sl_index(i); unsigned long ipnt = sl_getp(ipnt); unsigned long ipntp = sl_getp(ipntp); unsigned long k = ipnt + i; double*restrict X = sl_getp(X); const double*restrict V = sl_getp(V); // output_uint(k,2); output_char('\n',2); // output_int(ipntp + i / 2, 2); output_char('\n',2); X[ipntp + i / 2] = X[k] - V[k] * X[k-1] - V[k+1] * X[k+1]; }
sl_enddef

/*
 * sha_main_outer: process one input block and fold it into the running
 * state h0..h4.
 *
 * The first 16 schedule words are copied from the input at this thread's
 * index offset by a child family; words 16..79 are then derived
 * sequentially. A second family of 80 threads runs over the schedule
 * starting from the current h0..h4, and the values it hands back are
 * added to h0..h4.
 */
sl_def(sha_main_outer, void,
       sl_glparm(const uint32_t*restrict, input),
       sl_shparm(unsigned long, h0),
       sl_shparm(unsigned long, h1),
       sl_shparm(unsigned long, h2),
       sl_shparm(unsigned long, h3),
       sl_shparm(unsigned long, h4))
{
    sl_index(offset_base);

    int t;
    const uint32_t*restrict block = sl_getp(input) + offset_base;
    uint32_t w[80];

    /* copy the first 16 words of the block into the schedule */
    sl_create(,PLACE_LOCAL,,16,,,, buf_copy,
              sl_glarg(const uint32_t*restrict, src, block),
              sl_glarg(uint32_t*restrict, dst, w));
    sl_sync();

    /* word extension: not easily made concurrent! */
    for (t = 16; t < 80; ++t) {
        uint32_t mixed = w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16];
        w[t] = ROL32(mixed, 1);
    }

    sl_create(,,,80,,,, sha_main_inner,
              sl_glarg(const uint32_t*restrict, wg, w),
              sl_sharg(unsigned long, a),
              sl_sharg(unsigned long, b),
              sl_sharg(unsigned long, c),
              sl_sharg(unsigned long, d),
              sl_sharg(unsigned long, e));
    /* seed the inner family with the incoming state */
    sl_seta(a, sl_getp(h0));
    sl_seta(b, sl_getp(h1));
    sl_seta(c, sl_getp(h2));
    sl_seta(d, sl_getp(h3));
    sl_seta(e, sl_getp(h4));
    sl_sync();

    /* add the result of the inner family to the incoming state */
    sl_setp(h0, sl_getp(h0) + sl_geta(a));
    sl_setp(h1, sl_getp(h1) + sl_geta(b));
    sl_setp(h2, sl_getp(h2) + sl_geta(c));
    sl_setp(h3, sl_getp(h3) + sl_geta(d));
    sl_setp(h4, sl_getp(h4) + sl_geta(e));
}
sl_enddef /** GOL **/ sl_def(gol,void,sl_shparm(int,breaked)) { sl_index(index); int flag = sl_getp(breaked); if(flag==0) { //info_print("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements); //printf("GOL Thread %d :Processing %d blocks\n", index, b_queue->elements); //info_print("%d\n",iter); if( index + 1 == cycle) flag = 1; debug_print("Creating Worker family...\n"); sl_create(,,,,0,block_size,,run,sl_glarg(int,iteration,index),sl_sharg(int,sta,0)); debug_print("Waiting for sync...\n"); sl_sync(); debug_print("Workers finished, processing request queue...\n"); sl_create(,,,,0,block_size,,process_requests,sl_sharg(int,stat,0)); debug_print("Waiting for sync...\n"); sl_sync(); debug_print("Request queue processing finished, traversing...\n"); b_queue->elements = 0; sl_create(,,,,0,block_size,,traverse,sl_sharg(int,state,1),sl_sharg(struct hashtable_itr*,itr,hashtable_iterator(table))); debug_print("Waiting for sync...\n"); sl_sync(); sl_setp(breaked,flag); if(flag == 1) sl_break; } else {
sl_enddef

/*
 * iprint: print the recorded indices, one per line.
 *
 * While the running count is below refcount, the thread prints the count,
 * a space, the entry indices[count] and a newline, then passes on
 * count + 1. Once the count has reached refcount, the count is passed on
 * unchanged and a break is issued.
 */
sl_def(iprint, int, sl_shparm(unsigned, count), sl_glparm(unsigned, refcount))
{
    sl_index(i);

    unsigned slot = sl_getp(count);
    unsigned limit = sl_getp(refcount);

    if (slot >= limit) {
        /* forward the count untouched before breaking */
        sl_setp(count, slot);
        sl_break ;
    }

    output_int(slot, 1);
    output_char(' ', 1);
    output_int(indices[slot], 1);
    output_char('\n', 1);

    sl_setp(count, slot + 1);
}
sl_enddef

/*
 * outerk2: one pass of the outer loop of kernel 2.
 *
 * Reads the current span (ii) and start offset (ipntp) from the shared
 * parameters, passes on half the span immediately, runs the inner family
 * over the odd indices 1, 3, ... below the span, and finally passes on
 * start + span as the new offset.
 */
sl_def(outerk2, void,
       sl_glparm(double*restrict, X),
       sl_glparm(const double*restrict, V),
       sl_shparm(unsigned long, ii),
       sl_shparm(unsigned long, ipntp))
{
    sl_index(m);

    unsigned long span = sl_getp(ii);
    unsigned long start = sl_getp(ipntp);
    unsigned long stop = span + start;

    /* release the halved span before the inner family is created */
    sl_setp(ii, span/2);

    sl_create(,,1,span,2,,, innerk2,
              sl_glarg(double*restrict, , sl_getp(X)),
              sl_glarg(const double*restrict, , sl_getp(V)),
              sl_glarg(unsigned long, , start),
              sl_glarg(unsigned long, , stop));
    sl_sync();

    sl_setp(ipntp, stop);
}
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 7 // equation of state // fragment //--------------------------------- //--------------------------------- // for ( k=0 ; k<n ; k++ ) // { // x[k] = u[k] + r*( z[k] + r*y[k] ) + // t*( u[k+3] + r*( u[k+2] + r*u[k+1] ) + // t*( u[k+6] + q*( u[k+5] + q*u[k+4] ) ) ); // } //--------------------------------- //independent loop sl_def(innerk7, void, sl_glparm(double*restrict, X), sl_glparm(const double*restrict, U), sl_glparm(const double*restrict, Z), sl_glparm(const double*restrict, Y), sl_glfparm(double, R), sl_glfparm(double, T), sl_glfparm(double, Q)) { sl_index(k); sl_getp(X)[k] = sl_getp(U)[k ] + sl_getp(R) * ( sl_getp(Z)[k ] + sl_getp(R) * sl_getp(Y)[k ] ) + sl_getp(T) * ( sl_getp(U)[k+3] + sl_getp(R) * ( sl_getp(U)[k+2] + sl_getp(R) * sl_getp(U)[k+1] ) + sl_getp(T) * ( sl_getp(U)[k+6] + sl_getp(Q) * ( sl_getp(U)[k+5] + sl_getp(Q) * sl_getp(U)[k+4] ) ) ); }
[[]] //--------------------------------- // Livemore Loops -- SLC (uTC) // M.A.Hicks, CSA Group, UvA // Implementation based on various // reference implementations // including the original FORTRAN // but mostly from // Roy Longbottom, 1996. //--------------------------------- // LIVERMORE KERNEL 1 // Hydro Fragment //--------------------------------- //--------------------------------- // 'Original' C: // for ( k=0 ; k<n ; k++ ) { // x[k] = q + y[k]*( r*z[k+10] + t*z[k+11] ); // } //--------------------------------- //Break down kernel into two families //this one does the 'meat' sl_def(innerk1, void, sl_glparm(double*restrict, X), sl_glfparm(double, Q), sl_glparm(const double*restrict, Y), sl_glfparm(double, R), sl_glparm(const double*restrict, ZX), sl_glfparm(double, T) ) { sl_index(i); //now the actual calculation sl_getp(X)[i] = sl_getp(Q) + sl_getp(Y)[i] * ( sl_getp(R) * sl_getp(ZX)[i+10] + sl_getp(T) * sl_getp(ZX)[i+11] ); }
/*
 * computeDataT: fill one element of the data array.
 *
 * Odd positions receive their own index; even positions receive
 * length minus the index.
 */
sl_def(computeDataT, void, sl_glparm(int*, data), sl_glparm(int, length))
{
    sl_index(cnt);

    int* out = sl_getp(data);

    if (cnt % 2) {
        out[cnt] = cnt;
    } else {
        out[cnt] = sl_getp(length) - cnt;
    }
}
/*
 * foo: thread function with no parameters and no work.
 * It only declares its index variable.
 */
sl_def(foo, void)
{
    sl_index(i);
}