/*
 * Generated scan of a two-dimensional iteration space parameterised by M.
 *
 * Issues the statement instances S1..S4 in the order chosen by the code
 * generator: four peeled instances first, then a lower band (i in 5..M+1),
 * then an upper band (i in M+2..2*M-1), and finally the single instance
 * S4(2*M, M).  S1..S4 and floord() are provided by the including file.
 */
void test(int M)
{
    /* Original iterators. */
    int i, j;

    /* Peeled start-up instances. */
    S3(2, 1) ;
    S1(3, 1) ;
    S1(4, 1) ;
    S4(4, 2) ;

    /* Lower band: every row starts with S1, S4 closes even rows. */
    for (i = 5; i <= M + 1; i++) {
        S1(i, 1) ;
        for (j = 2; j <= floord(i - 1, 2); j++) {
            S2(i, j) ;
        }
        if (!(i % 2)) {
            S4(i, i / 2) ;
        }
    }

    /* Upper band: the inner lower bound now follows i - M. */
    for (i = M + 2; i <= 2 * M - 1; i++) {
        for (j = i - M; j <= floord(i - 1, 2); j++) {
            S2(i, j) ;
        }
        if (!(i % 2)) {
            S4(i, i / 2) ;
        }
    }

    /* Last instance; the iterator is still set as in the generated code. */
    i = 2 * M ;
    S4(2 * M, M) ;
}
/*
 * Fills the first floord(N, 2) entries of a local N-element array with
 * their own index.  The loop is delimited by scop pragmas so that a
 * polyhedral extractor can pick it up; floord() comes from the including
 * file.
 */
void foo(int N)
{
    int a[N];
    int i;

#pragma scop
    for (i = 0; i < floord(N, 2); ++i) {
        a[i] = i;
    }
#pragma endscop
}
/*
 * Generated four-deep scan over scattering dimensions c1..c4, parameterised
 * by M.  Nothing is executed when M < 3.  c4 advances in steps of 4 and each
 * surviving point issues one S1 instance.  S1, min, max, floord and ceild
 * are provided by the including file.
 */
void test(int M)
{
    /* Scattering iterators. */
    int c1, c2, c3, c4;
    /* Original iterators (declared by the generator, not used directly). */
    int i, j, k, l;

    if (M < 3) {
        return;
    }

    for (c1 = -1; c1 <= min(2, floord(M+2, 4)); c1++) {
        for (c2 = max(ceild(2*c1-M+1, 4), ceild(4*c1-M-2, 4));
             c2 <= min(0, floord(c1, 2));
             c2++) {
            for (c3 = max(max(-4*c2-2, 4*c2+3), 4*c1-4*c2+1);
                 c3 <= min(min(min(M+3, -4*c2+9), 4*c2+2*M), 4*c1-4*c2+4);
                 c3++) {
                for (c4 = max(3*c3-4*floord(c3+M+1, 2)+6,
                              4*c2-c3-4*floord(-c3+1, 4)+2);
                     c4 <= min(min(4*c2+4, -c3+10), c3-2);
                     c4 += 4) {
                    /* Skip points outside the c2 window derived from c4. */
                    if ((c2 > floord(c4-1, 4)) || (c2 < ceild(c4-4, 4))) {
                        continue;
                    }
                    S1(c1-c2, c2, (c3+c4-2)/4, (c3-c4)/2);
                }
            }
        }
    }
}
/*
 * Generated scan over scattering dimensions (c1, c2), parameterised by n.
 * Only even c2 values carry a statement instance; for those the original
 * iterators i and j are recomputed and S1 is issued.  S1, min, max, floord
 * and ceild are provided by the including file.
 */
void test(int n)
{
    /* Scattering iterators. */
    int c1, c2;
    /* Original iterators. */
    int i, j;

    for (c1 = 0; c1 <= 5 * n; c1++) {
        for (c2 = max(c1 - n, ceild(2 * c1, 3));
             c2 <= min(c1, floord(2 * c1 + 2 * n, 3));
             c2++) {
            if (c2 % 2 != 0) {
                continue;   /* odd c2: no instance at this point */
            }
            i = (-2 * c1 + 3 * c2) / 2 ;
            j = c1 - c2 ;
            S1((-2 * c1 + 3 * c2) / 2, c1 - c2) ;
        }
    }
}
/* Generated from ../../../git/cloog/test/classen2.cloog by CLooG 0.14.0-271-gaa1e292 gmp bits in 0.14s. */
/*
 * Code fragment (no enclosing function is visible here).  It reads the
 * parameters M and N and the three outer tile coordinates
 * outerTimeTileScatter, outerProcTileScatter1 and outerProcTileScatter2,
 * and iterates compScatter1..compScatter3 within bounds derived from them.
 *
 * The outer `if` is a conjunction of affine constraints on those values;
 * when any of them fails, no S1 instance is issued.
 */
if ((M >= 2) && (N >= 3)
    && (outerProcTileScatter1 >= outerProcTileScatter2)
    && (5*outerProcTileScatter1 <= M+2*N-4)
    && (5*outerProcTileScatter1 <= 5*outerProcTileScatter2+N+2)
    && (outerProcTileScatter2 >= 0)
    && (5*outerProcTileScatter2 <= M+N-2)
    && (outerTimeTileScatter >= outerProcTileScatter1)
    && (outerTimeTileScatter <= 2*outerProcTileScatter1)
    && (outerTimeTileScatter <= outerProcTileScatter1+outerProcTileScatter2+1)
    && (5*outerTimeTileScatter <= 2*M+2*N-6)
    && (5*outerTimeTileScatter <= 5*outerProcTileScatter1+M+2)
    && (5*outerTimeTileScatter >= 10*outerProcTileScatter1-2*N-2)
    && (5*outerTimeTileScatter <= 5*outerProcTileScatter2+M+N)
    && (5*outerTimeTileScatter >= 10*outerProcTileScatter2-N-3)
    && (5*outerTimeTileScatter <= 10*outerProcTileScatter2+N+3)
    && (5*outerTimeTileScatter >= 5*outerProcTileScatter1+5*outerProcTileScatter2-N-4)) {
  /* compScatter1: bounded by the tile coordinates and by M, N. */
  for (compScatter1=max(max(max(max(max(4,5*outerTimeTileScatter),5*outerProcTileScatter2+1),5*outerProcTileScatter1+5*outerProcTileScatter2-N),10*outerProcTileScatter1-2*N+2),10*outerProcTileScatter2-N+1);
       compScatter1<=min(min(min(min(min(5*outerTimeTileScatter+4,2*M+2*N-6),5*outerProcTileScatter1+M+2),5*outerProcTileScatter1+5*outerProcTileScatter2+5),5*outerProcTileScatter2+M+N),10*outerProcTileScatter2+N+3);
       compScatter1++) {
    /* compScatter2: bounds additionally depend on compScatter1. */
    for (compScatter2=max(max(max(max(ceild(compScatter1+4,2),5*outerProcTileScatter1),5*outerProcTileScatter2+1),compScatter1-M+2),compScatter1-5*outerProcTileScatter2-1);
         compScatter2<=min(min(min(min(floord(compScatter1+2*N-2,2),compScatter1),5*outerProcTileScatter1+4),compScatter1-5*outerProcTileScatter2+N),5*outerProcTileScatter2+N+2);
         compScatter2++) {
      /* compScatter3: bounds depend on compScatter1 and compScatter2. */
      for (compScatter3=max(max(5*outerProcTileScatter2,compScatter1-compScatter2+3),compScatter2-N+2);
           compScatter3<=min(min(compScatter2-1,5*outerProcTileScatter2+4),compScatter1-compScatter2+N);
           compScatter3++) {
        /* First three arguments are affine combinations of the scattering
           iterators; the last three pass the iterators through unchanged. */
        S1(compScatter1-compScatter2+1,-compScatter1+compScatter2+compScatter3-2,compScatter2-compScatter3,compScatter1,compScatter2,compScatter3);
      }
    }
  }
}
double test_1(){ double start_time = omp_get_wtime(); int read=0, write = 1; // s is the number of non-pointy bit 2D slices of diamond tiling // that is available for the current tile size. int s = (tau/3) - 2; // subset_s is an input parameter indicating how many of those // slices we want to use in the repeated tiling pattern. // subset_s should be less than s and greater than or equal to 2. if (subset_s > s || subset_s<2) { fprintf(stderr, "Error: need 2<=subset_s<=s\n"); exit(-1); } // Set lower and upper bounds for spatial dimensions. // When did code gen have a non-inclusive upper bound. // Ian's upper bound is inclusive. int Li=1, Lj=1, Ui=upperBound+1, Uj=upperBound+1; // Loop over the tiling pattern. for (int toffset=0; toffset<T; toffset+=subset_s){ // Loop over phases of tiles within repeated tile pattern. // This is like iterating over the A and B trapezoid tile types. for (int c0 = -2; c0 <= 0; c0 += 1){ // Two loops over tiles within one phase. // All of the tiles within one phase can be done in parallel. // updates by Dave W, to the c1 and c2 loops, for OpenMP (from here to the end of the #if BOUNDING_BOX_FOR_PARALLEL_LOOPS // hoist out min_c1 and max_c1, then use that to hoist a bounding box for c2 // initial version is just aiming for correct and parallel, without worrying about a loose boundingbox int c1_lb = max( max( floord(Lj + (tau/3) * c0 + (tau/3), tau), c0 + floord(-2 * T + Lj - 1, tau) + 1), floord(Lj + 1, tau) ); // end init block c1 int c1_ub = min( min( floord(Uj + (tau/3) * c0 - ((tau/3)+2), tau) + 1, floord(T + Uj - 1, tau)), c0 + floord(Uj - 5, tau) + 2 ); // end cond block c1 // The two expressions below are the same as in the previous version, except that // in the c2_lb_min_expr, I have replaced c1 with: // c1_min_value where it appears with a positive coefficient, and // c1_max_value where it appears with a negative coefficient. 
// and in the c2_ub_max_expr, the opposite (i.e., c1 becomes c1_max_value where positive) // I will be embarrassed if I have done this wrong. /// Note that I assume tau > 0 #define c2_lb_min_expr(c1_min_value, c1_max_value) \ max( \ max( \ max( \ max( \ max( \ max( \ c0 - 2 * c1_max_value + floord(-Ui + Lj + 1,tau), \ -c1_max_value + floord(-2 * Ui - Uj + tau * c0 + tau * c1_min_value - tau-3, tau*2)+1), \ c1_min_value + floord(-Ui - 2 * Uj + 3, tau)), \ floord(-Ui - Uj + 3, tau)), \ c0 - c1_max_value + floord(-Ui - (tau/3) * c0 + ((tau/3)+1), tau)), \ c0 - c1_max_value + floord(-T - Ui, tau) + 1), \ -c1_max_value + floord(-Ui + 4, tau) - 1 \ ) /* end init block c2 */ #define c2_ub_max_expr(c1_min_value, c1_max_value) \ min( \ min( \ min( \ min( \ min( \ min( \ c0 - 2 * c1_min_value + floord(-Li + Uj - 2, tau) + 1, \ c0 - c1_min_value + floord(-Li - 2, tau) + 1), \ c0 - c1_min_value + floord(-Li - (tau/3) * c0 - ((tau/3)+1), tau) + 1), \ floord(T - Li - Lj, tau)), \ -c1_min_value + floord(2 * T - Li, tau)), \ c1_max_value + floord(-Li - 2 * Lj - 1, tau) + 1), \ -c1_min_value + floord(-2 * Li - Lj + tau * c0 + tau * c1_max_value + (tau-1), tau*2) \ ) /* end cond block c2 */ #define c2_lb_expr(c1_value) c2_lb_min_expr(c1_value, c1_value) #define c2_ub_expr(c1_value) c2_ub_max_expr(c1_value, c1_value) #if BOUNDING_BOX_FOR_PARALLEL_LOOPS int c2_box_lb = c2_lb_min_expr(c1_lb, c1_ub); int c2_box_ub = c2_ub_max_expr(c1_lb, c1_ub); #if PARALLEL // don't need to mention c1...c5 below, since they're scoped inside the for loops #pragma omp parallel for shared(start_time, s, Li, Lj, Ui, Uj, toffset, c0, c1_lb, c1_ub, c2_box_lb, c2_box_ub, ) private(read, write) collapse(2) #endif for (int c1 = c1_lb; c1 <= c1_ub; c1 += 1) { for (int c2 = c2_box_lb; c2 <= c2_box_ub; c2 += 1) if (c2 >= c2_lb_expr(c1) && c2 <= c2_ub_expr(c1)) { #else for (int c1 = c1_lb; c1 <= c1_ub; c1 += 1) { for (int c2 = c2_lb_expr(c1); c2 <= c2_ub_expr(c1); c2 += 1) { #endif //fprintf(stdout, "(%d,%d,%d)\n", 
c0,c1,c2); // Loop over subset_s time steps within tiling pattern // and within tile c0,c1,c2. // Every time the pattern is repeated, toffset will be // subset_s bigger. // The real t value is c3+toffset. We are just using the // tiling pattern from t=1 to t<=subset_s. for (int c3 = 1; c3 <= min(T-toffset,subset_s); c3 += 1){ int t = c3+toffset; // if t % 2 is 1, then read=0 and write=1 write = t & 1; read = 1-write; // x spatial dimension. for (int c4 = max( max( max( -tau * c1 - tau * c2 + 2 * c3 - (2*tau-2), -Uj - tau * c2 + c3 - (tau-2)), tau * c0 - tau * c1 - tau * c2 - c3), Li ); // end init block c4 c4 <= min( min( min( tau * c0 - tau * c1 - tau * c2 - c3 + (tau-1), -tau * c1 - tau * c2 + 2 * c3), -Lj - tau * c2 + c3), Ui - 1 ); // end cond block c4 c4 += 1){ // y spatial dimension. for (int c5 = max( max( tau * c1 - c3, Lj), -tau * c2 + c3 - c4 - (tau-1) ); // end init block c5 c5 <= min( min( Uj - 1, -tau * c2 + c3 - c4), tau * c1 - c3 + (tau-1) ); // end cond block c5 c5 += 1){ //fprintf(stdout, "(%d,%d,%d,%d,%d,%d)\n", c0,c1,c2,c3,c4,c5); stencil( read, write, c4, c5); } // for c5 } // for c4 } // for c3 } // for c2 } // for c1 } // for c0 } // for toffset double end_time = omp_get_wtime(); return (end_time - start_time); } int main( int argc, char* argv[] ){ setbuf(stdout, NULL); // set buffer to null, so prints ALWAYS print (for debug purposes mainly) bool verify = false; bool printtime = true; // Command line parsing char c; while ((c = getopt (argc, argv, "nc:s:p:T:t:hv")) != -1){ switch( c ) { case 'n': printtime=false; break; case 'c': // problem size cores = parseInt( optarg ); if( cores <= 0 ){ fprintf(stderr, "cores must be greater than 0: %d\n", cores); exit(BAD_RUN_TIME_PARAMETERS); } break; case 's': // subset //globalSeed = parseInt( optarg ); subset_s = parseInt( optarg ); break; case 'p': // problem size problemSize = parseInt( optarg ); if( problemSize <= 0 ){ fprintf(stderr, "problemSize must be greater than 0: %d\n", problemSize); 
exit(BAD_RUN_TIME_PARAMETERS); } break; case 'T': // T (time steps) T = parseInt( optarg ); if( T <= 0 ){ fprintf(stderr, "T must be greater than 0: %d\n", T); exit(BAD_RUN_TIME_PARAMETERS); } break; case 't': // tau #if defined tau fprintf(stderr, "don't use -t to set tau when you compiled with -Dtau=%d.\n", tau); if (parseInt(optarg) != tau) exit(BAD_COMPILE_TIME_PARAMETERS); #else tau = parseInt( optarg ); #endif break; case 'h': // help printf("usage: %s\n-n \t dont print time \n-p <problem size> \t problem size in elements \n-T <time steps>\t number of time steps\n-c <cores>\tnumber of threads\n-s <subset_s>\t tile parameter\n-t <tau>\t tile parameter\n-h\tthis dialogue\n-v\tverify output\n", argv[0]); exit(0); case 'v': // verify; verify = true; break; case '?': if (optopt == 'p') fprintf (stderr, "Option -%c requires positive int argument: problem size.\n", optopt); else if (optopt == 'T') fprintf (stderr, "Option -%c requires positive int argument: T.\n", optopt); else if (optopt == 's') fprintf (stderr, "Option -%c requires int argument: subset_s.\n", optopt); else if (optopt == 'c') fprintf (stderr, "Option -%c requires int argument: number of cores.\n", optopt); else if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); exit(0); default: exit(0); } } if( !( tau % 3 == 0 && tau >= 15 ) ){ #if defined tau fprintf(stderr, "tau must be a multiple of 3, and >= 15, but the program was compiled with -Dtau=%d, and thus can't run :-(\n", tau); exit(BAD_COMPILE_TIME_PARAMETERS); #else fprintf(stderr, "tau must be a multiple of 3, and >= 15, but it's %d; re-run with a different -t value\n", tau); exit(BAD_RUN_TIME_PARAMETERS); #endif } init(); initSpace(); double time = test_1(); if( printtime ) { printf( "Time: %f\n", time ); } if( verify ){ verifyResult( true ); } }
/*
 * Driver for a lattice simulation on the global 4-D array `grid`
 * (two copies indexed by the first subscript, then y, x and direction k).
 *
 * Steps, as visible in this function:
 *   1. derive omega and circle_R2 from the globals tau and circle_radius;
 *   2. print the run configuration;
 *   3. initialise every entry of both grid copies to a weight (w1, w2 or w3)
 *      times the mean of rho_in and rho_out;
 *   4. run the generated, time-tiled loop nest, which calls lbm_kernel()
 *      reading one grid copy and writing the other, the roles swapping with
 *      the parity of the time iterator t4;
 *   5. when TIME is defined, print elapsed time and MLUPS.
 *
 * Returns 0.
 */
int main(void) {
  int t, y, x, k;
  double total_lattice_pts = (double)nY * (double)nX * (double)nTimesteps;

  /* For timekeeping */
  int ts_return = -1;
  struct timeval start, end, result;
  double tdiff = 0.0;

  /* Compute values for global parameters */
  omega = 1.0 / tau;
  circle_R2 = circle_radius * circle_radius;
  double rho_avg = (rho_in + rho_out) / 2.0;

  printf( "2D Flow Past Cylinder simulation with D2Q9 lattice:\n" "\tscheme : 2-Grid, Fused, Pull\n" "\tgrid size : %d x %d = %.2lf * 10^3 Cells\n" "\tnTimeSteps : %d\n" "\tomega : %.6lf\n", nX, nY, nX * nY / 1.0e3, nTimesteps, omega);

  /* Initialize every direction of every point, in both grid copies, to
     weight * rho_avg: w1 for k == 0, w2 for k in 1..4, w3 for k in 5..nK-1.
     The loop bounds include the extra border rows/columns (nY + 6, nX + 4). */
  for (y = 0; y < nY + 2 + 4; y++) {
    for (x = 0; x < nX + 2 + 2; x++) {
      grid[0][y][x][0] = w1 * rho_avg;
      grid[1][y][x][0] = w1 * rho_avg;
      for (k = 1; k < 5; k++) {
        grid[0][y][x][k] = w2 * rho_avg;
        grid[1][y][x][k] = w2 * rho_avg;
      }
      for (k = 5; k < nK; k++) {
        grid[0][y][x][k] = w3 * rho_avg;
        grid[1][y][x][k] = w3 * rho_avg;
      }
    }
  }

  /* To satisfy PET */
  /* NOTE(review): these are narrowed to short; presumably nX and nY are
     small enough for that -- confirm against their definitions. */
  short _nX = nX + 2;
  short _nY = nY + 3;
  int _nTimesteps = nTimesteps;

#ifdef TIME
  gettimeofday(&start, 0);
#endif

  int t1, t2, t3, t4, t5, t6;
  int lb, ub, lbp, ubp, lb2, ub2;
  register int lbv, ubv;

  /* Start of CLooG code */
  if ((_nTimesteps >= 1) && (_nX >= 3) && (_nY >= 4)) {
    /* t1 is executed sequentially; for each t1 the t2 range [lbp, ubp] is
       distributed across threads by the OpenMP pragma below. */
    for (t1 = -1; t1 <= floord(5 * _nTimesteps + 3 * _nY - 8, 32); t1++) {
      lbp = max(max(ceild(4 * t1, 5), ceild(16 * t1 - _nTimesteps + 1, 12)), ceild(32 * t1 - _nTimesteps + 4, 32));
      ubp = min(min(floord(4 * t1 + 4, 3), floord(_nTimesteps + _nY - 2, 8)), floord(16 * t1 + _nY + 14, 20));
#pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6)
      for (t2 = lbp; t2 <= ubp; t2++) {
        for (t3 = max(max(0, ceild(4 * t1 - 3 * t2 - 1, 2)), ceild(8 * t2 - _nY - 4, 8)); t3 <= min(min(min(floord(_nTimesteps + _nX - 2, 8), floord(8 * t2 + _nX + 3, 8)), floord(16 * t1 - 12 * t2 + _nX + 18, 8)), floord(32 * t1 - 32 * t2 + _nY + _nX + 29, 8)); t3++) {
          /* t4 is passed to lbm_kernel as its time argument; t5 and t6 are
             skewed by t4, so (-t4 + t5) and (-t4 + t6) are the grid indices. */
          for (t4 = max( max(max(max(0, 16 * t1 - 12 * t2), 32 * t1 - 32 * t2 + 3), 8 * t2 - _nY + 1), 8 * t3 - _nX + 1); t4 <= min(min(min(min(_nTimesteps - 1, 8 * t2 + 4), 8 * t3 + 5), 16 * t1 - 12 * t2 + 19), 32 * t1 - 32 * t2 + _nY + 30); t4++) {
            /* Hoisted loop conditional */
            if (t4 % 2 == 0) {
              /* Even t4: read grid[0], write grid[1]. */
              for (t5 = max(max(8 * t2, t4 + 3), -32 * t1 + 32 * t2 + 2 * t4 - 31); t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4), t4 + _nY - 1); t5++) {
                lbv = max(8 * t3, t4 + 2);
                ubv = min(8 * t3 + 7, t4 + _nX - 1);
#pragma ivdep
#pragma vector always
                for (t6 = lbv; t6 <= ubv; t6++) {
                  lbm_kernel(grid[0][(-t4 + t5)][(-t4 + t6)][0], grid[0][(-t4 + t5) - 1][(-t4 + t6)][3], grid[0][(-t4 + t5) + 1][(-t4 + t6)][4], grid[0][(-t4 + t5)][(-t4 + t6) - 1][1], grid[0][(-t4 + t5)][(-t4 + t6) + 1][2], grid[0][(-t4 + t5) - 1][(-t4 + t6) - 1][5], grid[0][(-t4 + t5) - 1][(-t4 + t6) + 1][6], grid[0][(-t4 + t5) + 1][(-t4 + t6) - 1][7], grid[0][(-t4 + t5) + 1][(-t4 + t6) + 1][8], &grid[1][(-t4 + t5)][(-t4 + t6)][0], &grid[1][(-t4 + t5)][(-t4 + t6)][3], &grid[1][(-t4 + t5)][(-t4 + t6)][4], &grid[1][(-t4 + t5)][(-t4 + t6)][1], &grid[1][(-t4 + t5)][(-t4 + t6)][2], &grid[1][(-t4 + t5)][(-t4 + t6)][5], &grid[1][(-t4 + t5)][(-t4 + t6)][6], &grid[1][(-t4 + t5)][(-t4 + t6)][7], &grid[1][(-t4 + t5)][(-t4 + t6)][8], (t4), ((-t4 + t5)), ((-t4 + t6)));
                  ;
                }
              }
            } else {
              /* Odd t4: same sweep with the grid copies swapped
                 (read grid[1], write grid[0]). */
              for (t5 = max(max(8 * t2, t4 + 3), -32 * t1 + 32 * t2 + 2 * t4 - 31); t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4), t4 + _nY - 1); t5++) {
                lbv = max(8 * t3, t4 + 2);
                ubv = min(8 * t3 + 7, t4 + _nX - 1);
#pragma ivdep
#pragma vector always
                for (t6 = lbv; t6 <= ubv; t6++) {
                  lbm_kernel(grid[1][(-t4 + t5)][(-t4 + t6)][0], grid[1][(-t4 + t5) - 1][(-t4 + t6)][3], grid[1][(-t4 + t5) + 1][(-t4 + t6)][4], grid[1][(-t4 + t5)][(-t4 + t6) - 1][1], grid[1][(-t4 + t5)][(-t4 + t6) + 1][2], grid[1][(-t4 + t5) - 1][(-t4 + t6) - 1][5], grid[1][(-t4 + t5) - 1][(-t4 + t6) + 1][6], grid[1][(-t4 + t5) + 1][(-t4 + t6) - 1][7], grid[1][(-t4 + t5) + 1][(-t4 + t6) + 1][8], &grid[0][(-t4 + t5)][(-t4 + t6)][0], &grid[0][(-t4 + t5)][(-t4 + t6)][3], &grid[0][(-t4 + t5)][(-t4 + t6)][4], &grid[0][(-t4 + t5)][(-t4 + t6)][1], &grid[0][(-t4 + t5)][(-t4 + t6)][2], &grid[0][(-t4 + t5)][(-t4 + t6)][5], &grid[0][(-t4 + t5)][(-t4 + t6)][6], &grid[0][(-t4 + t5)][(-t4 + t6)][7], &grid[0][(-t4 + t5)][(-t4 + t6)][8], (t4), ((-t4 + t5)), ((-t4 + t6)));
                  ;
                }
              }
            } /* end hoisted if */
          }
        }
      }
    }
  }
  /* End of CLooG code */

#ifdef TIME
  gettimeofday(&end, 0);
  ts_return = timeval_subtract(&result, &end, &start);
  tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6);
  printf("\tTime taken : %7.5lfs\n", tdiff);
  printf("\tMLUPS : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff)));
#endif

#ifdef DEBUG
  /* Dump rho, uX, uY for the entire domain to verify results */
  /* NOTE(review): `t` is declared above but never assigned in this
     function, so an indeterminate value is passed here. */
  dumpVelocities(t);
#endif

  return 0;
}
/* Issue s0(c0) for every c0 from 4*floord(m + 1, 32) up to n, but only when
   the guard on m holds. */
if (m >= 8 * floord(m + 1, 8)) {
  for (int c0 = 4 * floord(m + 1, 32); c0 <= n; c0++) {
    s0(c0);
  }
}
/* Generated from ./non_optimal/nul_complex1.cloog by CLooG 0.18.1-2-g43fc508 gmp bits in 0.00s. */
/* Scan (c1, c2) for n >= 0; only even c2 values carry an S1 instance. */
if (n >= 0) {
  for (c1 = 0; c1 <= 5*n; c1++) {
    for (c2 = max(ceild(2*c1, 3), c1-n);
         c2 <= min(floord(2*c1+2*n, 3), c1);
         c2++) {
      if (c2 % 2 != 0) {
        continue;   /* odd c2: nothing to execute */
      }
      S1(((-2*c1+3*c2)/2),(c1-c2));
    }
  }
}
/*
 * Timing driver for a tiled, unroll-jammed loop nest.
 *
 * Calls init_arrays() once, then repeats the kernel REPS times, summing the
 * rtclock() difference around each repetition.  Prints the average time per
 * repetition and returns w[0] truncated to int.
 *
 * The statement macros S1..S4 below spell out exactly what each statement
 * instance computes on the global arrays A, B, u1, u2, v1, v2, w, x, y, z
 * and the scalars alpha and beta.
 */
int main() {
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++) {
    annot_t_start = rtclock();

/* ceild/floord go through double-precision ceil()/floor(), so loop bounds
   built from them are compared as doubles. */
#define ceild(n,d) ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y) ((x) > (y)? (x) : (y))
#define min(x,y) ((x) < (y)? (x) : (y))

/* The zT* parameters are accepted but not used in the macro bodies. */
#define S1(zT0,zT1,zT2,zT3,i,j) {B[i][j]=u2[i]*v2[j]+u1[i]*v1[j]+A[i][j];}
#define S2(zT0,zT1,zT2,zT3,i,j) {x[i]=beta*B[j][i]*y[j]+x[i];}
#define S3(i) {x[i]=z[i]+x[i];}
#define S4(i,j) {w[i]=alpha*B[i][j]*x[j]+w[i];}

    int c1, c2, c3, c4, c5, c6, c7, c8, c9, c10;
    register int lbv, ubv;

    /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 0.05s. */
    /* c2/c3 step over outer tiles (the 8x factor nests 32-wide inner tiles
       c4/c5 inside them); c6/c7 are the point loops. */
    for (c2=0;c2<=floord(N-1,256);c2++) {
      for (c3=0;c3<=floord(N-1,256);c3++) {
        for (c4=max(0,8*c2);c4<=min(8*c2+7,floord(N-1,32));c4++) {
          for (c5=max(8*c3,0);c5<=min(floord(N-1,32),8*c3+7);c5++) {
            /* The annotation below records the loop that was transformed;
               the code after it is the unroll-by-32 result. */
/*@ begin Loop(
  transform UnrollJam(ufactor=32)
  for (c6=max(32*c5,0);c6<=min(N-1,32*c5+31);c6++) {
    {
      lbv=max(32*c4,0); ubv=min(N-1,32*c4+31);
      #pragma ivdep
      #pragma vector always
      for (c7=lbv; c7<=ubv; c7++) {
        S1(c3,c2,c5,c4,c6,c7) ;
        S2(c2,c3,c4,c5,c7,c6) ;
      }
    }
  }
) @*/{
              /* Main part: c6 advances by 32; each c7 iteration executes
                 the S1/S2 pair for c6 .. c6+31. */
              for (c6 = max(32 * c5, 0); c6 <= min(N - 1, 32 * c5 + 31) - 31; c6 = c6 + 32) {
                lbv=max(32*c4,0);
                ubv=min(N-1,32*c4+31);
#pragma ivdep
#pragma vector always
                for (c7=lbv; c7<=ubv; c7++) {
                  S1(c3, c2, c5, c4, c6, c7); S2(c2, c3, c4, c5, c7, c6);
                  S1(c3, c2, c5, c4, (c6 + 1), c7); S2(c2, c3, c4, c5, c7, (c6 + 1));
                  S1(c3, c2, c5, c4, (c6 + 2), c7); S2(c2, c3, c4, c5, c7, (c6 + 2));
                  S1(c3, c2, c5, c4, (c6 + 3), c7); S2(c2, c3, c4, c5, c7, (c6 + 3));
                  S1(c3, c2, c5, c4, (c6 + 4), c7); S2(c2, c3, c4, c5, c7, (c6 + 4));
                  S1(c3, c2, c5, c4, (c6 + 5), c7); S2(c2, c3, c4, c5, c7, (c6 + 5));
                  S1(c3, c2, c5, c4, (c6 + 6), c7); S2(c2, c3, c4, c5, c7, (c6 + 6));
                  S1(c3, c2, c5, c4, (c6 + 7), c7); S2(c2, c3, c4, c5, c7, (c6 + 7));
                  S1(c3, c2, c5, c4, (c6 + 8), c7); S2(c2, c3, c4, c5, c7, (c6 + 8));
                  S1(c3, c2, c5, c4, (c6 + 9), c7); S2(c2, c3, c4, c5, c7, (c6 + 9));
                  S1(c3, c2, c5, c4, (c6 + 10), c7); S2(c2, c3, c4, c5, c7, (c6 + 10));
                  S1(c3, c2, c5, c4, (c6 + 11), c7); S2(c2, c3, c4, c5, c7, (c6 + 11));
                  S1(c3, c2, c5, c4, (c6 + 12), c7); S2(c2, c3, c4, c5, c7, (c6 + 12));
                  S1(c3, c2, c5, c4, (c6 + 13), c7); S2(c2, c3, c4, c5, c7, (c6 + 13));
                  S1(c3, c2, c5, c4, (c6 + 14), c7); S2(c2, c3, c4, c5, c7, (c6 + 14));
                  S1(c3, c2, c5, c4, (c6 + 15), c7); S2(c2, c3, c4, c5, c7, (c6 + 15));
                  S1(c3, c2, c5, c4, (c6 + 16), c7); S2(c2, c3, c4, c5, c7, (c6 + 16));
                  S1(c3, c2, c5, c4, (c6 + 17), c7); S2(c2, c3, c4, c5, c7, (c6 + 17));
                  S1(c3, c2, c5, c4, (c6 + 18), c7); S2(c2, c3, c4, c5, c7, (c6 + 18));
                  S1(c3, c2, c5, c4, (c6 + 19), c7); S2(c2, c3, c4, c5, c7, (c6 + 19));
                  S1(c3, c2, c5, c4, (c6 + 20), c7); S2(c2, c3, c4, c5, c7, (c6 + 20));
                  S1(c3, c2, c5, c4, (c6 + 21), c7); S2(c2, c3, c4, c5, c7, (c6 + 21));
                  S1(c3, c2, c5, c4, (c6 + 22), c7); S2(c2, c3, c4, c5, c7, (c6 + 22));
                  S1(c3, c2, c5, c4, (c6 + 23), c7); S2(c2, c3, c4, c5, c7, (c6 + 23));
                  S1(c3, c2, c5, c4, (c6 + 24), c7); S2(c2, c3, c4, c5, c7, (c6 + 24));
                  S1(c3, c2, c5, c4, (c6 + 25), c7); S2(c2, c3, c4, c5, c7, (c6 + 25));
                  S1(c3, c2, c5, c4, (c6 + 26), c7); S2(c2, c3, c4, c5, c7, (c6 + 26));
                  S1(c3, c2, c5, c4, (c6 + 27), c7); S2(c2, c3, c4, c5, c7, (c6 + 27));
                  S1(c3, c2, c5, c4, (c6 + 28), c7); S2(c2, c3, c4, c5, c7, (c6 + 28));
                  S1(c3, c2, c5, c4, (c6 + 29), c7); S2(c2, c3, c4, c5, c7, (c6 + 29));
                  S1(c3, c2, c5, c4, (c6 + 30), c7); S2(c2, c3, c4, c5, c7, (c6 + 30));
                  S1(c3, c2, c5, c4, (c6 + 31), c7); S2(c2, c3, c4, c5, c7, (c6 + 31));
                }
              }
              /* Remainder: continues from wherever c6 stopped, one row at
                 a time. */
              for (; c6 <= min(N - 1, 32 * c5 + 31); c6 = c6 + 1) {
                lbv=max(32*c4,0);
                ubv=min(N-1,32*c4+31);
#pragma ivdep
#pragma vector always
                for (c7=lbv; c7<=ubv; c7++) {
                  S1(c3, c2, c5, c4, c6, c7);
                  S2(c2, c3, c4, c5, c7, c6);
                }
              }
            }
/*@ end @*/
          }
        }
      }
    }
    /* x += z, element by element. */
    for (c2=0;c2<=N-1;c2++) {
      S3(c2) ;
    }
    /* w[i] accumulates alpha * B[i][j] * x[j] over j. */
    for (c2=0;c2<=N-1;c2++) {
      for (c3=0;c3<=N-1;c3++) {
        S4(c2,c3) ;
      }
    }
    /* End of CLooG code */

    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }

  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);

  return ((int) w[0]);
}
/* Issue s0(c0) for c0 starting at 4*floord(m - 1, 12) + 4, in steps of 4,
   while c0 <= floord(n, 3). */
for (int c0 = 4 * floord(m - 1, 12) + 4; c0 <= floord(n, 3); c0 += 4) {
  s0(c0);
}
/*
 * Six-deep generated scan issuing s0(c0, ..., c5).  c0..c3 range over at
 * most 0..3; c4 and c5 are the fine-grained iterators whose bounds depend
 * on the four outer ones.  Bound expressions are kept exactly as generated;
 * only layout and braces were added.
 */
for (int c0 = 0; c0 <= 3; c0 += 1) {
  for (int c1 = max(0, 2 * c0 - 3);
       c1 <= min(c0 + c0 / 2 + 1, 3);
       c1 += 1) {
    for (int c2 = c0;
         c2 <= min(min(3, 3 * c1 + 2), 2 * c0 - c1 + 1);
         c2 += 1) {
      for (int c3 = max(max(max(c1 - (-c1 + 3) / 3, 0), c2 + floord(3 * c1 - c2 - 1, 6)), 2 * c0 - (c0 + c1 + 1) / 3 - 1);
           c3 <= min(c0 + 1, 3);
           c3 += 1) {
        for (int c4 = max(max(max(max(-200 * c1 + 400 * c3 - 199, 333 * c1 + c1 / 3), 333 * c2 + (c2 + 1) / 3), 667 * c0 - 333 * c1 - (c0 + c1 + 3) / 3 - 332), 250 * c3 + 1);
             c4 <= min(min(min(min(500 * c0 + 499, -200 * c1 + 400 * c3 + 400), 333 * c3 - (-c3 + 3) / 3 + 334), 1000), 333 * c2 - (-c2 + 3) / 3 + 333);
             c4 += 1) {
          for (int c5 = max(max(max(1000 * c3 - 2 * c4 + 2, 1000 * c0 - c4), 500 * c1 + (c4 + 1) / 2), c4);
               c5 <= min(min(min(1000 * c3 - 2 * c4 + 1001, 1000 * c0 - c4 + 999), 500 * c1 + (c4 + 1) / 2 + 499), 2 * c4 + 1);
               c5 += 1) {
            s0(c0, c1, c2, c3, c4, c5);
          }
        }
      }
    }
  }
}
/* Two-deep generated scan: for each (c0, c1) inside the bounds derived from
   M and N, issue S2(c0 - c1, c1). */
for (int c0 = 1;
     c0 < max((6 * M + 3 * N + 188) / 200 - 2, (N + 93) / 100 + 3 * ((2 * M + N - 4) / 200) - 1);
     c0 += 1) {
  for (int c1 = max(0, floord(-N + 100 * c0 + 106, 300));
       c1 <= min((2 * M + N - 4) / 200 - 1, (c0 - 1) / 3);
       c1 += 1) {
    S2(c0 - c1, c1);
  }
}
/* Two-deep generated scan: c1 advances in steps of 2 from a lower bound
   whose parity is adjusted through (n + c0 + 1) % 2; each point issues
   S1((-2*c0 + 3*c1)/2, c0 - c1). */
for (int c0 = 0; c0 <= 5 * n; c0 += 1) {
  for (int c1 = max(-((n + c0 + 1) % 2) - n + c0 + 1, 2 * floord(c0 - 1, 3) + 2);
       c1 <= min(n + c0 - (n + c0 + 2) / 3, c0);
       c1 += 2) {
    S1((-2 * c0 + 3 * c1) / 2, c0 - c1);
  }
}
/*
 * Five-deep generated scan parameterised by m.  Each point issues s0 with
 * the five iterators followed by two derived values computed from
 * (c0, c3) and (c1, c2, c4) respectively.
 */
for (int c0 = 1; c0 <= min(4, floord(2 * m - 1, 17) + 1); c0 += 1) {
  for (int c1 = 1;
       c1 <= min(2, -2 * c0 + (2 * m + 3 * c0 - 4) / 10 + 3);
       c1 += 1) {
    for (int c2 = 0;
         c2 <= min(2, -c0 - c1 + (2 * m + 3 * c0 + 10 * c1 + 6) / 20 + 1);
         c2 += 1) {
      for (int c3 = 8 * c0 + (c0 + 1) / 2 - 8;
           c3 <= min(min(30, m - 5 * c1 - 10 * c2 + 5), 8 * c0 + c0 / 2);
           c3 += 1) {
        for (int c4 = 5 * c1 + 10 * c2 - 4;
             c4 <= min(5 * c1 + 10 * c2, m - c3 + 1);
             c4 += 1) {
          s0(c0, c1, c2, c3, c4, -9 * c0 + c3 + c0 / 2 + 9, -5 * c1 - 5 * c2 + c4 + 5);
        }
      }
    }
  }
}
int main() { int tx, ty, x, y; int trial; IN = (double **)malloc((N+2) * sizeof(double *)); for (int i=0; i<N+2; i++) IN[i] = (double *)malloc((N) * sizeof(double)); BLURX = (double **)malloc((N) * sizeof(double *)); for (int i=0; i<N; i++) BLURX[i] = (double *)malloc((N) * sizeof(double)); OUT = (double **)malloc((N) * sizeof(double *)); for (int i=0; i<N; i++) OUT[i] = (double *)malloc((N) * sizeof(double)); init_array(); #ifdef PERFCTR PERF_INIT; #endif IF_TIME(t_start = rtclock()); for (trial = 0; trial < 10 ; ++trial) { #pragma scop for(ty = 0; ty <= floord((N-1),B); ++ty) #pragma omp parallel for private(tx,x,y) for(tx = 0; tx <= floord((N-1),B); ++tx){ for(x = 0; x <= B-1; ++x){ for(y = 0; y <= B-1; ++y) /* P */ blurx(x,y)=in(x,y)+in((x+1),y)+in((x+2),y); for(y = 0; y <= B-1; ++y) if(ty*B+y>=2) /* Q */ out(x,y)=blurx(x,y)+blurx(x,(y-1))+blurx(x,(y-2)); } } #pragma endscop } IF_TIME(t_end = rtclock()); IF_TIME(fprintf(stderr, "File:%s \t\t N=%d,T=%d \t Runtime=%0.6lfs\n", __FILE__, N, B, (t_end - t_start)/10)); #ifdef PERFCTR PERF_EXIT; #endif #ifdef VERIFY for(x = 0; x <= N-1; ++x) for(y = 0; y <= N-1; ++y) BLURX[x][y]=IN[x][y]+IN[x+1][y]+IN[x+2][y]; // Stage 2: vertical blur for(x = 0; x <= N-1; ++x) for(y = 2; y <= N-1; ++y) { if(OUT[x][y] != BLURX[x][y]+BLURX[x][y-1]+BLURX[x][y-2]) { printf("Difference at (%d, %d) : %f versus %f\n", x, y, OUT[x][y], BLURX[x][y]+BLURX[x][y-1]+BLURX[x][y-2]); break; } } #endif if (fopen(".test", "r")) { // print_array(); } return 0; }
/*
 * Timing driver for a generated, 32x32x32-tiled loop nest over the global
 * arrays ex, ey and hz.
 *
 * Calls init_arrays() once, then runs the nest REPS times, summing the
 * rtclock() difference around each run.  Prints the average time per run
 * and returns 1.
 *
 * NOTE(review): the statement bodies look like a 2-D FDTD update
 * (ey/ex updated from hz differences, hz from ex/ey differences) -- confirm.
 */
int main() {
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++) {
    annot_t_start = rtclock();

    int t, i, j, k, l, m, n,ii,jj;

/* Statement macros.  The zT* parameters are accepted but unused; t, i, j
   are macro parameters and shadow the locals of the same name.
   S1: ey[0][j] = t
   S2: ey[i][j] -= 0.5 * (hz[i][j] - hz[i-1][j])
   S3: ex[i][j] -= 0.5 * (hz[i][j] - hz[i][j-1])
   S4: hz[i][j] -= 0.7 * (ey[i+1][j] + ex[i][j+1] - ex[i][j] - ey[i][j]) */
#define S1(zT0,zT1,t,j) {ey[0][j]=t;}
#define S2(zT0,zT1,zT2,t,i,j) {ey[i][j]=ey[i][j]-((double)(1))/2*(hz[i][j]-hz[i-1][j]);}
#define S3(zT0,zT1,zT2,t,i,j) {ex[i][j]=ex[i][j]-((double)(1))/2*(hz[i][j]-hz[i][j-1]);}
#define S4(zT0,zT1,zT2,t,i,j) {hz[i][j]=hz[i][j]-((double)(7))/10*(ey[1+i][j]+ex[i][1+j]-ex[i][j]-ey[i][j]);}

    int c1, c2, c3, c4, c5, c6, c7;
    register int lbv, ubv;

    /* c1, c2, c3 enumerate tiles; c4 is the time iterator passed to the
       statements; c5 and c6 are skewed by c4 (the statements receive
       -c4+c5 and -c4+c6).  The many separate c4 loops below are the
       boundary/interior cases the generator split the tile into; their
       relative order is part of the schedule. */
    for (c1=0;c1<=floord(tmax-1,32);c1++) {
      for (c2=max(ceild(32*c1-31,32),0);c2<=min(floord(tmax+ny-1,32),floord(32*c1+ny+31,32));c2++) {
        for (c3=max(max(max(max(ceild(32*c2-ny-30,32),0),ceild(64*c1-32*c2-61,32)),ceild(32*c1-31,32)),ceild(32*c1-992*c2-1891,992));c3<=min(min(floord(32*c2+nx+30,32),floord(tmax+nx-1,32)),floord(32*c1+nx+31,32));c3++) {
          /* Trailing S4 instances on row nx-1 for time 32*c3-nx. */
          if ((c1 <= floord(32*c3-nx,32)) && (c2 <= floord(32*c3-nx+ny,32)) && (c3 >= ceild(nx,32))) {
            for (c5=max(32*c3-nx+1,32*c2);c5<=min(32*c2+31,32*c3-nx+ny);c5++) {
              S4(c1,-c1+c3,-c1+c2,32*c3-nx,nx-1,-32*c3+c5+nx-1) ;
            }
          }
          /* Trailing S4 instances on column ny-1 for time 32*c2-ny. */
          if ((c1 <= floord(32*c2-ny,32)) && (c2 >= max(ceild(32*c3-nx+ny+1,32),ceild(ny,32)))) {
            for (c6=max(32*c3,32*c2-ny+1);c6<=min(32*c2+nx-ny,32*c3+31);c6++) {
              S4(c1,-c1+c3,-c1+c2,32*c2-ny,-32*c2+c6+ny-1,ny-1) ;
            }
          }
          /* c1 == c3 cases: tiles that contain row 0, where S1 and the
             i == 0 instance of S3 are issued. */
          if (c1 == c3) {
            for (c4=max(max(32*c2-ny+1,0),32*c3);c4<=min(min(32*c3+30,32*c2-ny+31),tmax-1);c4++) {
              for (c5=32*c2;c5<=c4+ny-1;c5++) {
                S1(c1,-c1+c2,c4,-c4+c5) ;
                S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
                for (c6=c4+1;c6<=32*c3+31;c6++) {
                  S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
                }
              }
              for (c6=c4+1;c6<=32*c3+31;c6++) {
                S4(c1,0,-c1+c2,c4,-c4+c6-1,ny-1) ;
              }
            }
          }
          if (c1 == c3) {
            for (c4=max(max(0,32*c3),32*c2-ny+32);c4<=min(min(tmax-1,32*c3+30),32*c2-1);c4++) {
              for (c5=32*c2;c5<=32*c2+31;c5++) {
                S1(c1,-c1+c2,c4,-c4+c5) ;
                S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
                for (c6=c4+1;c6<=32*c3+31;c6++) {
                  S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
                }
              }
            }
          }
          if (c1 == c3) {
            for (c4=max(max(32*c2,0),32*c3);c4<=min(min(tmax-1,32*c3+30),32*c2+30);c4++) {
              S1(c1,-c1+c2,c4,0) ;
              for (c6=c4+1;c6<=32*c3+31;c6++) {
                S2(c1,0,-c1+c2,c4,-c4+c6,0) ;
              }
              for (c5=c4+1;c5<=32*c2+31;c5++) {
                S1(c1,-c1+c2,c4,-c4+c5) ;
                S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
                for (c6=c4+1;c6<=32*c3+31;c6++) {
                  S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
                  S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
                }
              }
            }
          }
          /* General tiles, split by whether the tile touches the upper nx
             and/or ny boundary (extra S4 instances with nx-1 / ny-1) or the
             j == 0 column (S2 only). */
          for (c4=max(max(max(32*c1,0),32*c2-ny+1),32*c3-nx+1);c4<=min(min(min(32*c3-nx+31,32*c2-ny+31),32*c1+31),tmax-1);c4++) {
            for (c5=32*c2;c5<=c4+ny-1;c5++) {
              for (c6=32*c3;c6<=c4+nx-1;c6++) {
                S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
              }
              S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
            }
            for (c6=32*c3;c6<=c4+nx;c6++) {
              S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ;
            }
          }
          for (c4=max(max(max(32*c1,0),32*c3-nx+1),32*c2-ny+32);c4<=min(min(min(tmax-1,32*c1+31),32*c2-1),32*c3-nx+31);c4++) {
            for (c5=32*c2;c5<=32*c2+31;c5++) {
              for (c6=32*c3;c6<=c4+nx-1;c6++) {
                S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
              }
              S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
            }
          }
          for (c4=max(max(max(32*c3-nx+32,32*c1),0),32*c2-ny+1);c4<=min(min(min(32*c2-ny+31,32*c1+31),tmax-1),32*c3-1);c4++) {
            for (c5=32*c2;c5<=c4+ny-1;c5++) {
              for (c6=32*c3;c6<=32*c3+31;c6++) {
                S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
              }
            }
            for (c6=32*c3;c6<=32*c3+31;c6++) {
              S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ;
            }
          }
          for (c4=max(max(max(32*c2,32*c1),0),32*c3-nx+1);c4<=min(min(min(32*c2+30,tmax-1),32*c1+31),32*c3-nx+31);c4++) {
            for (c6=32*c3;c6<=c4+nx-1;c6++) {
              S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ;
            }
            for (c5=c4+1;c5<=32*c2+31;c5++) {
              for (c6=32*c3;c6<=c4+nx-1;c6++) {
                S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
              }
              S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
            }
          }
          /* Full interior tile: fixed 32x32 extent in c5 and c6.  The
             annotation records the loop that was transformed; the code after
             it has c6 outermost and unrolled by 4, with c5 as the inner
             loop. */
          for (c4=max(max(max(32*c1,0),32*c3-nx+32),32*c2-ny+32);c4<=min(min(min(32*c3-1,tmax-1),32*c1+31),32*c2-1);c4++) {
/*@ begin Loop(
  transform Composite(
    tile = [('c5',T1,'ii'),('c6',T2,'jj')],
    permut = [PERMUTS],
    unrolljam = [('c5',U1),('c6',U2)],
    vector = (VEC, ['ivdep','vector always'])
  )
  for (c5=32*c2;c5<=32*c2+31;c5++)
    for (c6=32*c3;c6<=32*c3+31;c6++) {
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
    }
) @*/{
              for (c6=32*c3; c6<=32*c3+28; c6=c6+4) {
                register int cbv_1, cbv_2;
                cbv_1=32*c2;
                cbv_2=32*c2+31;
#pragma ivdep
#pragma vector always
                for (c5=cbv_1; c5<=cbv_2; c5=c5+1) {
                  S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
                  S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5);
                  S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5);
                  S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5);
                  S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
                  S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5);
                  S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5);
                  S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5);
                  S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1);
                  S4(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5-1);
                  S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5-1);
                  S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5-1);
                }
              }
              /* Remainder loop for c6 values not covered by the step-4
                 loop above. */
              for (; c6<=32*c3+31; c6=c6+1) {
                register int cbv_3, cbv_4;
                cbv_3=32*c2;
                cbv_4=32*c2+31;
#pragma ivdep
#pragma vector always
                for (c5=cbv_3; c5<=cbv_4; c5=c5+1) {
                  S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
                  S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
                  S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1);
                }
              }
            }
/*@ end @*/
          }
          for (c4=max(max(max(32*c2,32*c3-nx+32),32*c1),0);c4<=min(min(min(32*c3-1,32*c2+30),tmax-1),32*c1+31);c4++) {
            for (c6=32*c3;c6<=32*c3+31;c6++) {
              S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ;
            }
            for (c5=c4+1;c5<=32*c2+31;c5++) {
              for (c6=32*c3;c6<=32*c3+31;c6++) {
                S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
                S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
              }
            }
          }
          /* Epilogue cases for the last time step of a tile
             (time 32*c2+31 or 32*c1+31). */
          if ((c1 == c3) && (c2 <= min(floord(32*c3-1,32),floord(tmax-32,32)))) {
            S1(c1,-c1+c2,32*c2+31,0) ;
            for (c6=32*c2+32;c6<=32*c3+31;c6++) {
              S2(c1,0,-c1+c2,32*c2+31,-32*c2+c6-31,0) ;
            }
          }
          if ((-c1 == -c3) && (c1 >= ceild(32*c2-31,32)) && (c1 <= min(floord(tmax-32,32),floord(32*c2-1,32)))) {
            S1(c1,-c1+c2,32*c1+31,0) ;
            for (c5=32*c1+32;c5<=32*c2+31;c5++) {
              S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ;
              S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ;
            }
          }
          if ((-c1 == -c3) && (c1 <= min(floord(tmax-32,32),c2-1))) {
            for (c5=32*c2;c5<=min(32*c2+31,32*c1+ny+30);c5++) {
              S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ;
              S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ;
            }
          }
          if ((-c1 == -c2) && (-c1 == -c3) && (c1 <= floord(tmax-32,32))) {
            S1(c1,0,32*c1+31,0) ;
          }
          if ((c1 >= c2) && (c2 <= min(c3-1,floord(tmax-32,32)))) {
            for (c6=32*c3;c6<=min(32*c2+nx+30,32*c3+31);c6++) {
              S2(c1,-c1+c3,-c1+c2,32*c2+31,-32*c2+c6-31,0) ;
            }
          }
        }
      }
    }

    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }

  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);

  return 1;
}
/*
 * Seven-deep generated scan issuing s0 with eight arguments.  There is no
 * c4 loop: the fifth argument is the derived value c2 / 3.  Bound
 * expressions are kept exactly as generated; only layout and braces were
 * added.
 */
for (int c0 = 0; c0 <= 3; c0 += 1) {
  for (int c1 = max(0, 2 * c0 - 3);
       c1 <= min(c0 + 1, 3);
       c1 += 1) {
    for (int c2 = c0;
         c2 <= min(min(3, 3 * c1 + 2), 2 * c0 - c1 + 1);
         c2 += 1) {
      for (int c3 = max(max(max(c2 - (c2 + 2) / 3, c2 + floord(3 * c1 - c2 - 1, 6)), c1 - (-c1 + 3) / 3), c0 - (-c2 + 3) / 3);
           c3 <= min(c0 + c0 / 2 + 1, 3);
           c3 += 1) {
        for (int c5 = max(max(max(max(c1 - (-c1 + 3) / 3, 0), 2 * c3 - 4), c3 - (c3 + 3) / 3), c2 - (c2 + 3) / 3);
             c5 <= min(min(-c2 + 2 * c3 - (c2 + 3) / 3 + 2, c1 + 1), c3);
             c5 += 1) {
          for (int c6 = max(max(max(max(max(250 * c3 + 1, 667 * c0 - 333 * c1 - (c0 + c1 + 3) / 3 - 332), -200 * c1 + 400 * c3 - 199), 333 * c1 + c1 / 3), 1000 * c0 - 500 * c5 - 501), 333 * c2 + (c2 + 1) / 3);
               c6 <= min(min(min(min(min(min(333 * c3 - (-c3 + 3) / 3 + 334, 1000), 333 * c2 - (-c2 + 3) / 3 + 333), 1000 * c0 - 500 * c5 + 997), 500 * c5 + 501), 500 * c0 + 499), -200 * c1 + 400 * c3 + 400);
               c6 += 1) {
            for (int c7 = max(max(max(max(c6, 500 * c1 + (c6 + 1) / 2), 1000 * c0 - c6), 500 * c5 + 2), 1000 * c3 - 2 * c6 + 2);
                 c7 <= min(min(min(min(500 * c5 + 501, 2 * c6 + 1), 1000 * c3 - 2 * c6 + 1001), 1000 * c0 - c6 + 999), 500 * c1 + (c6 + 1) / 2 + 499);
                 c7 += 1) {
              s0(c0, c1, c2, c3, c2 / 3, c5, c6, c7);
            }
          }
        }
      }
    }
  }
}
for (int c0 = floord(m, 4); c0 <= n; c0 += 1) s0(c0);
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); /*@ begin PerfTuning ( def build { arg build_command = 'icc -O3 -openmp -I/usr/local/icc/include -lm'; } def performance_counter { arg repetitions = 1; } def performance_params { # param T1_1[] = [1,16,32,64,128]; # param T1_2[] = [1,16,32,64,128]; # param T1_3[] = [1,16,32,64,128]; # param T2_1[] = [1,4,8,16,32]; # param T2_2[] = [1,4,8,16,32]; # param T2_3[] = [1,4,8,16,32]; param T1_1[] = [64]; param T1_2[] = [256]; param T1_3[] = [64]; param T2_1[] = [1]; param T2_2[] = [1]; param T2_3[] = [1]; constraint c1 = (T1_1*T2_1<=1024 and T1_1*T2_1<=1024 and T1_1*T2_1<=1024); constraint c2 = ((T1_1 == T1_3) and (T2_1 == T2_3)); param U1[] = [1]; param U2[] = [1]; param U3[] = [7]; constraint c3 = (U1*U2*U3<=512); param PERM[] = [ #[0,1,2], #[0,2,1], #[1,0,2], #[1,2,0], [2,0,1], #[2,1,0], ]; param PAR[] = [True]; param SCREP[] = [False]; param IVEC[] = [True]; } def search { arg algorithm = 'Exhaustive'; # arg algorithm = 'Simplex'; # arg time_limit = 5; # arg total_runs = 1; } def input_params { param N[] = [1024]; } def input_vars { arg decl_file = 'decl_code.h'; arg init_file = 'init_code.c'; } ) @*/ /**-- (Generated by Orio) Best performance cost: 0.201184 Tuned for specific problem sizes: N = 1024 Best performance parameters: IVEC = True PAR = True PERM = [2, 0, 1] SCREP = False T1_1 = 64 T1_2 = 256 T1_3 = 64 T2_1 = 1 T2_2 = 1 T2_3 = 1 U1 = 1 U2 = 1 U3 = 7 --**/ register int i,j,k; register int c1t, c2t, c3t, c4t, c5t, c6t, c7t, c8t, c9t, c10t, c11t, c12t; register int newlb_c1, newlb_c2, newlb_c3, newlb_c4, newlb_c5, newlb_c6, newlb_c7, newlb_c8, newlb_c9, newlb_c10, newlb_c11, newlb_c12; register int newub_c1, newub_c2, newub_c3, newub_c4, newub_c5, newub_c6, newub_c7, newub_c8, newub_c9, newub_c10, newub_c11, newub_c12; /*@ begin PolySyn( parallel = PAR; tiles = [T1_1,T1_2,T1_3,T2_1,T2_2,T2_3]; 
permut = PERM; unroll_factors = [U1,U2,U3]; scalar_replace = SCREP; vectorize = IVEC; profiling_code = 'lu_profiling.c'; compile_cmd = 'gcc'; compile_opts = '-lm'; ) @*/ #include <math.h> #include <assert.h> #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) int c1, c2, c3, c4, c5, c6, c7, c8, c9; register int lb, ub, lb1, ub1, lb2, ub2; /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 0.05s. */ for (c1=-1;c1<=floord(5*N-9,256);c1++) { lb1=max(max(ceild(32*c1-127,160),ceild(64*c1-N+2,64)),0); ub1=min(floord(64*c1+63,64),floord(N-1,256)); #pragma omp parallel for shared(c1,lb1,ub1) private(c2,c3,c4,c5,c6,c7,c8,c9) for (c2=lb1; c2<=ub1; c2++) { for (c3=max(ceild(32*c1-32*c2-1953,2016),ceild(32*c1-32*c2-31,32));c3<=floord(N-1,64);c3++) { if (c1 == c2+c3) { for (c7=max(64*c3,0);c7<=min(min(N-2,64*c3+62),256*c2+254);c7++) { for (c8=max(c7+1,256*c2);c8<=min(N-1,256*c2+255);c8++) { A[c7][c8]=A[c7][c8]/A[c7][c7] ; for (c9=c7+1;c9<=min(N-1,64*c3+63);c9++) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ; } } } } /*@ begin Loop( transform Composite( permut = [['c9', 'c7', 'c8']], regtile = (['c7', 'c8', 'c9'],[1, 1, 7]), scalarreplace = (False, 'double'), vector = (True, ['ivdep','vector always'])) for (c7=max(0,64*c1-64*c2);c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1);c7++) { for (c8=max(c7+1,256*c2);c8<=min(256*c2+255,N-1);c8++) { for (c9=64*c3;c9<=min(N-1,64*c3+63);c9++) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ; } } } ) @*/{ for (c9t=64*c3; c9t<=min(N-1,64*c3+63)-6; c9t=c9t+7) { for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) { register int cbv_1, cbv_2; cbv_1=max(c7+1,256*c2); cbv_2=min(256*c2+255,N-1); #pragma ivdep #pragma vector always for (c8=cbv_1; c8<=cbv_2; c8++ ) { A[c9t][c8]=A[c9t][c8]-A[c9t][c7]*A[c7][c8]; A[(c9t+1)][c8]=A[(c9t+1)][c8]-A[(c9t+1)][c7]*A[c7][c8]; 
A[(c9t+2)][c8]=A[(c9t+2)][c8]-A[(c9t+2)][c7]*A[c7][c8]; A[(c9t+3)][c8]=A[(c9t+3)][c8]-A[(c9t+3)][c7]*A[c7][c8]; A[(c9t+4)][c8]=A[(c9t+4)][c8]-A[(c9t+4)][c7]*A[c7][c8]; A[(c9t+5)][c8]=A[(c9t+5)][c8]-A[(c9t+5)][c7]*A[c7][c8]; A[(c9t+6)][c8]=A[(c9t+6)][c8]-A[(c9t+6)][c7]*A[c7][c8]; } } } for (c9=c9t; c9<=min(N-1,64*c3+63); c9=c9+1) { for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) { register int cbv_3, cbv_4; cbv_3=max(c7+1,256*c2); cbv_4=min(256*c2+255,N-1); #pragma ivdep #pragma vector always for (c8=cbv_3; c8<=cbv_4; c8++ ) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8]; } } } } /*@ end @*/ if ((-c1 == -c2-c3) && (c1 <= min(floord(320*c2+191,64),floord(64*c2+N-65,64)))) { for (c8=max(256*c2,64*c1-64*c2+64);c8<=min(256*c2+255,N-1);c8++) { A[64*c1-64*c2+63][c8]=A[64*c1-64*c2+63][c8]/A[64*c1-64*c2+63][64*c1-64*c2+63] ; } } } } } /* End of CLooG code */ /*@ end @*/ /*@ end @*/ annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; #ifndef TEST printf("%f\n", annot_t_total); #else { int i, j; for (i=0; i<N; i++) { for (j=0; j<N; j++) { if (j%100==0) printf("\n"); printf("%f ",A[i][j]); } printf("\n"); } } #endif return ((int) A[0][0]); }
/* Generated from ../../../git/cloog/test/isl/jacobi-shared.cloog by CLooG 0.16.3-2-g5511bef gmp bits in 1.82s. */
/*
 * When h0 is odd and all four tile guards hold, step c0 in strides of 16 and
 * run S1 for each c0 whose companion index c1 is at most 32.
 */
if (((h0+1)%2 == 0)
    && (16*floord(t0-1,16) >= -N+g1+t0+1)
    && (16*floord(N+15*g1+15*t0+15,16) >= 15*g1+15*t0+19)
    && (32*floord(t1-1,32) <= g2+t1-3)
    && (32*floord(t1-1,32) >= -N+g2+t1+1)) {
  c0 = max(-16*floord(t0-1,16)+t0,-16*floord(g1+t0-3,16)+t0);
  while (c0 <= min(32,N-g1-1)) {
    c1 = -32*floord(t1-1,32)+t1;
    if (c1 <= 32) {
      S1(c0+g1-1,c1+g2-1);
    }
    c0 += 16;
  }
}
/*
 * For each c0 in 0..5*n, walk c1 in steps of 2 between the computed bounds
 * and invoke S1((3 * c1 / 2) - c0, c0 - c1).
 */
for (int c0 = 0; c0 <= 5 * n; c0 += 1) {
  int c1 = max(-((5 * n - c0 + 1) % 2) - n + c0 + 1, 2 * floord(c0 - 1, 3) + 2);
  while (c1 <= min(c0, n + c0 - (n + c0 + 2) / 3)) {
    S1((3 * c1 / 2) - c0, c0 - c1);
    c1 += 2;
  }
}
/*
 * Timing driver for a tiled in-place 9-point averaging stencil on the global
 * array A.  The kernel is executed REPS times; the mean of the rtclock()
 * differences is printed and A[0][0], truncated to int, is returned.
 */
int main() {
  init_arrays();
  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;
  for (annot_i=0; annot_i<REPS; annot_i++) {
    annot_t_start = rtclock();
#include <math.h>
#include <assert.h>
/* Ceiling/floor division computed in double precision, plus min/max.
 * max/min evaluate their arguments more than once. */
#define ceild(n,d) ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y) ((x) > (y)? (x) : (y))
#define min(x,y) ((x) < (y)? (x) : (y))
/* S1 replaces A[i][j] by the mean of its 3x3 neighbourhood; the six zT*
 * arguments and t are accepted but not used in the expansion. */
#define S1(zT0,zT1,zT2,zT3,zT4,zT5,t,i,j) {A[i][j]=(A[1+i][1+j]+A[1+i][j]+A[1+i][j-1]+A[i][1+j]+A[i][j]+A[i][j-1]+A[i-1][1+j]+A[i-1][j]+A[i-1][j-1])/9;}
    int c1, c2, c3, c4, c5, c6, c7, c8, c9;
    register int lb, ub, lb1, ub1, lb2, ub2;
    register int lbv, ubv;
    /* Nested parallelism is enabled because the c2 and c3 loops below are
     * both annotated with "omp parallel for". */
    omp_set_nested(1);
    omp_set_num_threads(2);
    /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 5.45s. */
    for (c1=-2;c1<=floord(4*T+3*N-10,256);c1++) {
      lb1=max(max(max(0,ceild(256*c1-2*T-N-251,512)),ceild(256*c1-3*T-2*N+7,256)),ceild(256*c1-N-761,1024));
      ub1=min(min(min(floord(256*c1+2*N+505,1024),floord(256*c1+509,512)),floord(64*c1+127,64)),floord(T+N-3,256));
#pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9)
      for (c2=lb1; c2<=ub1; c2++) {
        /* lb2/ub2 are private per c2 iteration and bound the inner parallel c3 loop. */
        lb2=max(max(max(max(max(max(ceild(256*c1-256*c2-T+1,256),ceild(512*c1-512*c2-253,768)),0),ceild(512*c2-N-252,256)),ceild(128*c1-256*c2-127,128)),ceild(128*c2-127,128)),ceild(128*c1-127,256));
        ub2=min(min(min(min(min(min(floord(256*c1-256*c2+255,256),floord(256*c1-512*c2+N+253,256)),floord(256*c2+T+N+252,256)),floord(T+N-3,128)),floord(256*c1+N+508,512)),floord(256*c1-256*c2+N+253,384)),floord(512*c2+N+507,256));
#pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9)
        for (c3=lb2; c3<=ub2; c3++) {
          for (c4=max(max(max(max(0,ceild(-256*c2+256*c3-N-284,32)),8*c1-8*c2-8*c3),ceild(256*c2-N-29,32)),ceild(128*c3-N-29,32));c4<=min(min(min(min(8*c1-8*c2-8*c3+7,floord(256*c3+253,64)),floord(T-1,32)),floord(128*c2+127,16)),floord(-128*c2+128*c3+127,16));c4++) {
            for (c5=max(max(max(max(max(8*c2,ceild(16*c4-15,16)),ceild(256*c3-T-N-28,32)),0),ceild(256*c3-32*c4-N-60,32)),ceild(256*c3-N-59,64));c5<=min(min(min(min(min(floord(32*c4+N+29,32),floord(128*c3+127,16)),8*c2+7),floord(128*c3-16*c4+127,16)),floord(T+N-3,32)),floord(256*c3+N+252,64));c5++) {
              for (c6=max(max(max(max(max(ceild(64*c4-29,32),8*c3),ceild(16*c5-15,16)),ceild(16*c4+16*c5-15,16)),0),ceild(64*c5-N-28,32));c6<=min(min(min(min(min(8*c3+7,floord(T+N-3,16)),floord(32*c4+32*c5+N+60,32)),floord(32*c4+N+29,16)),floord(64*c5+N+59,32)),floord(32*c5+T+N+28,32));c6++) {
                /* c7 is passed to S1 as t and is bounded above by T-1. */
                for (c7=max(max(max(max(0,32*c4),32*c5-N+2),16*c6-N+2),-32*c5+32*c6-N-29);c7<=min(min(min(min(-32*c5+32*c6+30,floord(32*c6+29,2)),T-1),32*c5+30),32*c4+31);c7++) {
/*@ begin Loop( transform UnrollJam(ufactor=8) for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) transform Unroll(ufactor=8) for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) { S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ; } ) @*/
                  /* The code below is the same loop nest as in the annotation
                   * above, not an unrolled version.  S1 receives the array
                   * indices as i = -c7+c8 and j = -c7-c8+c9. */
                  for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) {
                    for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) {
                      S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ;
                    }
                  }
/*@ end @*/
                }
              }
            }
          }
        }
      }
    }
    /* End of CLooG code */
    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  return ((int) A[0][0]);
}
/*
 * Timing driver for a tiled in-place LU-style elimination on the global
 * array A, with the trailing update unroll-and-jammed 8x8.  The kernel runs
 * REPS times; the mean of the rtclock() differences is printed and A[0][0],
 * truncated to int, is returned.
 */
int main() {
  init_arrays();
  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;
  /* Nested parallelism is enabled because the c2 and c3 loops below are both
   * annotated with "omp parallel for". */
  omp_set_nested(1);
  omp_set_num_threads(2);
  for (annot_i=0; annot_i<REPS; annot_i++) {
    annot_t_start = rtclock();
    register int i,j,k;
/* S1 divides A[k][j] by the pivot A[k][k]; S2 subtracts A[i][k]*A[k][j] from
 * A[i][j].  The leading zT* arguments are not used in either expansion. */
#define S1(zT0,zT1,zT2,zT3,k,j) {A[k][j]=A[k][j]/A[k][k];}
#define S2(zT0,zT1,zT2,zT3,zT4,zT5,k,i,j) {A[i][j]=A[i][j]-A[i][k]*A[k][j];}
    int c1, c2, c3, c4, c5, c6, c7, c8, c9;
    register int lb, ub, lb1, ub1, lb2, ub2;
    register int lbv, ubv;
    /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 2.21s. */
    for (c1=-2;c1<=floord(3*N-4,256);c1++) {
      lb1=max(max(0,ceild(256*c1-N-253,512)),ceild(256*c1-2*N+3,256));
      ub1=min(floord(128*c1+255,128),floord(N-1,256));
#pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9)
      for (c2=lb1; c2<=ub1; c2++) {
        lb2=max(max(max(ceild(256*c1-256*c2-N+2,256),ceild(128*c1-256*c2-127,128)),ceild(128*c1-128*c2-32385,32768)),ceild(128*c1-128*c2-127,256));
        ub2=min(floord(N-1,256),floord(256*c1-256*c2+255,256));
#pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9)
        for (c3=lb2; c3<=ub2; c3++) {
          for (c4=max(max(8*c1-8*c2-8*c3,0),8*c1-8*c2-1800*c3-1778);c4<=min(min(min(min(floord(3968*c3+3937,16),8*c1-8*c2-8*c3+7),floord(128*c2+127,16)),floord(N-2,32)),floord(128*c3+127,16));c4++) {
            for (c5=max(max(ceild(16*c4-15,16),0),8*c2);c5<=min(floord(N-1,32),8*c2+7);c5++) {
              for (c6=max(max(max(max(ceild(16*c4-465,496),ceild(8*c1-8*c2-16*c3-c4-217,223)),ceild(-8*c1+8*c2+16*c3+c4-217,225)),8*c3),ceild(16*c4-15,16));c6<=min(8*c3+7,floord(N-1,32));c6++) {
                /* Tile containing the pivot rows: S1 (pivot division)
                 * followed by S2 on the rows below, within the same tile. */
                if ((c1 == c2+2*c3) && (c4 == c6)) {
                  for (c7=max(0,32*c6);c7<=min(min(32*c5+30,32*c6+30),N-2);c7++) {
                    for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) {
                      if ((c1-c2)%2 == 0) {
                        S1((c1-c2)/2,c2,c4,c5,c7,c8) ;
                      }
                      for (c9=c7+1;c9<=min(32*c6+31,N-1);c9++) {
                        /* The parity test is emitted twice by the generator;
                         * the inner check repeats the outer one. */
                        if ((c1-c2)%2 == 0) {
                          if ((c1-c2)%2 == 0) {
                            S2((c1-c2)/2,(c1-c2)/2,c2,c4,c4,c5,c7,c9,c8) ;
                          }
                        }
                      }
                    }
                  }
                }
                for (c7=max(32*c4,0);c7<=min(min(32*c6-1,32*c5+30),32*c4+31);c7++) {
/*@ begin Loop( transform UnrollJam(ufactor=8) for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) transform Unroll(ufactor=8) for (c9=32*c6;c9<=min(N-1,32*c6+31);c9++) { S2(c1-c2-c3,c3,c2,c4,c6,c5,c7,c9,c8) ; } ) @*/
                  {
                    /* Main part: c8 advances by 8; each pass covers columns
                     * c8 .. c8+7. */
                    for (c8 = max(c7 + 1, 32 * c5); c8 <= min(32 * c5 + 31, N - 1) - 7; c8 = c8 + 8) {
                      /* Full 8x8 block: rows c9 .. c9+7 for each of the 8 columns. */
                      for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8) {
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 7));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 7));
                      }
                      /* Leftover rows (c9 continues from the loop above),
                       * still for all 8 columns of this pass. */
                      for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1) {
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6));
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7));
                      }
                    }
                    /* Leftover columns (c8 continues from the loop above):
                     * one column at a time, rows unrolled by 8 plus a
                     * scalar tail. */
                    for (; c8 <= min(32 * c5 + 31, N - 1); c8 = c8 + 1) {
                      for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8) {
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8);
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8);
                      }
                      for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1)
                        S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8);
                    }
                  }
/*@ end @*/
                }
                /* Last row of the pivot tile (row 32*c4+31): only the S1
                 * division is performed here. */
                if ((c1 == c2+2*c3) && (-c4 == -c6) && (c4 <= min(floord(N-33,32),floord(32*c5-1,32)))) {
                  for (c8=max(32*c5,32*c4+32);c8<=min(N-1,32*c5+31);c8++) {
                    if ((c1-c2)%2 == 0) {
                      S1((c1-c2)/2,c2,c4,c5,32*c4+31,c8) ;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    /* End of CLooG code */
    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  return ((int) A[0][0]);
}
int main(void) { int t = 0, z, y, x, k; double total_lattice_pts = (double)nZ * (double)nY * (double)nX * (double)nTimesteps; /* For timekeeping */ int ts_return = -1; struct timeval start, end, result; double tdiff = 0.0; /* Compute values for global parameters */ omega = 2.0 / ((6.0 * sqrt(uTopX * uTopX + uTopY * uTopY) * (nX - 0.5) / re) + 1.0); printf( "3D Lid Driven Cavity simulation with D3Q19 lattice:\n" "\tscheme : 3-Grid, Fused, Pull\n" "\tgrid size : %d x %d x %d = %.2lf * 10^3 Cells\n" "\tnTimeSteps : %d\n" "\tRe : %.2lf\n" "\tuTopX : %.6lf\n" "\tuTopY : %.6lf\n" "\tomega : %.6lf\n", nX, nY, nZ, nX * nY * nZ / 1.0e3, nTimesteps, re, uTopX, uTopY, omega); /* Initialize all 19 PDFs for each point in the domain to w1, w2 or w3 * accordingly */ for (z = 0; z < nZ + 2 + 4; z++) { for (y = 0; y < nY + 2 + 2; y++) { for (x = 0; x < nX + 2 + 2; x++) { grid[0][z][y][x][0] = w1; grid[1][z][y][x][0] = w1; for (k = 1; k < 7; k++) { grid[0][z][y][x][k] = w2; grid[1][z][y][x][k] = w2; } for (k = 7; k < nK; k++) { grid[0][z][y][x][k] = w3; grid[1][z][y][x][k] = w3; } } } } /* To satisfy PET */ short _nX = nX + 3; short _nY = nY + 3; short _nZ = nZ + 4; short _nTimesteps = nTimesteps; #ifdef TIME gettimeofday(&start, 0); #endif int t1, t2, t3, t4, t5, t6, t7, t8; int lb, ub, lbp, ubp, lb2, ub2; register int lbv, ubv; /* Start of CLooG code */ if ((_nTimesteps >= 1) && (_nX >= 2) && (_nY >= 2) && (_nZ >= 3)) { for (t1 = -1; t1 <= floord(_nTimesteps - 1, 8); t1++) { lbp = max(ceild(t1, 2), ceild(16 * t1 - _nTimesteps + 3, 16)); ubp = min(floord(_nTimesteps + _nZ - 2, 16), floord(8 * t1 + _nZ + 6, 16)); #pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6, t7, t8) for (t2 = lbp; t2 <= ubp; t2++) { for (t3 = max(max(0, ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16)); t3 <= min(min(min(floord(_nTimesteps + _nY - 2, 16), floord(8 * t1 + _nY + 14, 16)), floord(16 * t2 + _nY + 12, 16)), floord(16 * t1 - 16 * t2 + _nZ + _nY + 13, 16)); t3++) { for (t4 = max( max(max(0, 
ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16)), ceild(16 * t3 - _nY - 13, 16)); t4 <= min(min(min(min(floord(_nTimesteps + _nX - 2, 16), floord(8 * t1 + _nX + 14, 16)), floord(16 * t2 + _nX + 12, 16)), floord(16 * t3 + _nX + 13, 16)), floord(16 * t1 - 16 * t2 + _nZ + _nX + 13, 16)); t4++) { for (t5 = max(max(max(max(max(0, 8 * t1), 16 * t1 - 16 * t2 + 2), 16 * t2 - _nZ + 1), 16 * t3 - _nY + 1), 16 * t4 - _nX + 1); t5 <= min(min(min(min(min(_nTimesteps - 1, 8 * t1 + 15), 16 * t2 + 13), 16 * t3 + 14), 16 * t4 + 14), 16 * t1 - 16 * t2 + _nZ + 14); t5++) { /* Hoisted loop conditional */ if (t5 % 2 == 0) { for (t6 = max(max(16 * t2, t5 + 2), -16 * t1 + 16 * t2 + 2 * t5 - 15); t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5), t5 + _nZ - 1); t6++) { for (t7 = max(16 * t3, t5 + 1); t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) { lbv = max(16 * t4, t5 + 1); ubv = min(16 * t4 + 15, t5 + _nX - 1); #pragma ivdep #pragma vector always for (t8 = lbv; t8 <= ubv; t8++) { lbm_kernel( grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1], grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1] [7], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1] [8], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1] [9], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) + 1] [10], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1] [11], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1] [12], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1] [13], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1] [14], grid[0][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)] [15], grid[0][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)] [16], grid[0][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)] [17], grid[0][(-t5 
+ t6) + 1][(-t5 + t7) + 1][(-t5 + t8)] [18], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18], (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8))); ; } } } } else { for (t6 = max(max(16 * t2, t5 + 2), -16 * t1 + 16 * t2 + 2 * t5 - 15); t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5), t5 + _nZ - 1); t6++) { for (t7 = max(16 * t3, t5 + 1); t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) { lbv = max(16 * t4, t5 + 1); ubv = min(16 * t4 + 15, t5 + _nX - 1); #pragma ivdep #pragma vector always for (t8 = lbv; t8 <= ubv; t8++) { lbm_kernel( grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1], grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3], grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1] [7], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1] [8], grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1] [9], grid[1][(-t5 + t6)][(-t5 + t7) + 
1][(-t5 + t8) + 1] [10], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1] [11], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1] [12], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1] [13], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1] [14], grid[1][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)] [15], grid[1][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)] [16], grid[1][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)] [17], grid[1][(-t5 + t6) + 1][(-t5 + t7) + 1][(-t5 + t8)] [18], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18], (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8))); ; } } } } /* end hoisted if */ } } } } } } /* End of CLooG code */ #ifdef TIME gettimeofday(&end, 0); ts_return = timeval_subtract(&result, &end, &start); tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6); printf("\tTime taken : %7.5lfm\n", tdiff / 60.0); printf("\tMLUPS : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff))); #endif #ifdef DEBUG /* Dump rho, uX, uY for the entire domain to verify results */ dumpVelocities(t); #endif return 0; }
/*
 * Statement macros followed by a tiled loop nest that executes S1 and S2.
 * The four leading zT* arguments are accepted but unused in every expansion.
 *   S1: B[i][j] = A[i][j] + u1[i]*v1[j] + u2[i]*v2[j]
 *   S2: x[i] += beta * B[j][i] * y[j]
 *   S3: x[i] += z[i]
 *   S4: w[i] += alpha * B[i][j] * x[j]
 * Only S1 and S2 are used in the portion visible here.
 * NOTE(review): this snippet ends with the t3, t2 and t1 loops still open;
 * the remainder (including any use of S3/S4) is not present -- confirm
 * against the full generated file.
 */
#define S1(zT0,zT1,zT2,zT3,i,j) B[i][j]=A[i][j]+u1[i]*v1[j]+u2[i]*v2[j];
#define S2(zT0,zT1,zT2,zT3,i,j) x[i]=x[i]+beta*B[j][i]*y[j];
#define S3(zT0,zT1,zT2,zT3,i) x[i]=x[i]+z[i];
#define S4(zT0,zT1,zT2,zT3,i,j) w[i]=w[i]+alpha*B[i][j]*x[j];
int t0, t1, t2, t3, t4, t5, t6, t7;
register int lb, ub, lb1, ub1, lb2, ub2;
register int lbv, ubv;
/* Generated from PLUTO-produced CLooG file by CLooG v0.14.1 64 bits in 0.03s. */
lb1=0;
ub1=floord(N-1,8000);
/* t0 is listed as shared but is never assigned in the visible code. */
#pragma omp parallel for shared(t0,lb1,ub1) private(t1,t2,t3,t4,t5,t6,t7)
for (t1=lb1; t1<=ub1; t1++) {
  for (t2=0;t2<=floord(N-1,256);t2++) {
    for (t3=max(20*t1,0);t3<=min(20*t1+19,floord(N-1,400));t3++) {
      for (t4=max(0,16*t2);t4<=min(16*t2+15,floord(N-1,16));t4++) {
        for (t5=max(0,16*t4);t5<=min(N-1,16*t4+15);t5++) {
          {
            /* t6 spans one block of 400 indices selected by t3, clamped to N-1. */
            lbv=max(0,400*t3);
            ubv=min(N-1,400*t3+399);
#pragma ivdep
#pragma vector always
            for (t6=lbv; t6<=ubv; t6++) {
              /* S1 writes B[t5][t6]; S2 then reads that same element
               * (its i/j arguments are passed in swapped order). */
              S1(t1,t2,t3,t4,t5,t6) ;
              S2(t1,t2,t3,t4,t6,t5) ;
            }
          }
        }
      }
/*
 * For every c1 in 5..5*M, in this order: run the S1 instances (over c3, c5),
 * then the S2 instances (over c3), then S4 when c1 is a multiple of 5, and
 * finally the S3 instances (c3 in steps of 3).
 */
for (int c1 = 5; c1 <= 5 * M; c1 += 1) {
  {
    int c3 = max(2, floord(-M + c1, 4));
    while (c3 < min(M, (c1 + 1) / 3 - 2)) {
      int c5 = max(1, -M - c3 + (M + c1) / 2 - 2);
      while (c5 < min(c3, -2 * c3 + (c1 + c3) / 2 - 2)) {
        S1(c1 - 2 * c3 - 2 * c5 - 5, c3, c5);
        c5 += 1;
      }
      c3 += 1;
    }
  }
  {
    int c3 = max(1, floord(-M + c1, 4));
    while (c3 < (c1 + 1) / 5) {
      S2(c1 - 4 * c3 - 3, c3);
      c3 += 1;
    }
  }
  if (c1 % 5 == 0) {
    S4(c1 / 5);
  }
  {
    int c3 = max(-3 * M - c1 + 3 * ((M + c1) / 2) + 1, -((c1 - 1) % 3) + 3);
    while (c3 < (c1 + 1) / 5) {
      S3((c1 - 2 * c3 - 1) / 3, c3);
      c3 += 3;
    }
  }
}
/* Only when n is even: call S(c0) for c0 stepping by 2 from the computed
 * start value up to and including 100. */
if (n % 2 == 0) {
  int c0 = (n / 2) + 2 * floord(-n - 1, 4) + 2;
  while (c0 <= 100) {
    S(c0);
    c0 += 2;
  }
}
/*
 * test - walk a parameterised iteration space and fire S1 / S2 instances.
 *
 * The body looks machine-generated (the "Scattering iterators" /
 * "Original iterators" comments follow that style): t1, t2, t3 are the
 * scattering iterators that drive the loops, and i, j, k are assigned the
 * original-iterator values recovered from them.
 *
 * n: the only parameter.  Every loop bound and guard is an expression in n
 *    and the enclosing iterators.
 *
 * Layout, in execution order:
 *   - guarded blocks for n >= 1, for n == 2 (written as
 *     (n >= 2) && (n <= 2)) and for n >= 3, each pinning t1 to one value;
 *   - loops over negative t1, split into two ranges, with an extra block
 *     for n == 2 between them;
 *   - loops over t1 in 0..1, again split into two ranges;
 *   - loops over t1 >= 2, split at n-5 / n-4;
 *   - closing guarded blocks for n >= 3 and n >= 1.
 *
 * Each "/2" that appears in an assignment or in an S1 / S2 argument sits
 * under a "% 2 == 0" guard on a quantity of the same parity, so those
 * divisions are exact.
 *
 * The stores to i, j, k and the stand-alone stores to t1 / t2 are not read
 * again by the code visible here; S1 and S2 receive the expressions
 * directly.  NOTE(review): S1, S2, min, max, floord and ceild are defined
 * outside this function - confirm they do not refer to these locals by name
 * before treating the stores as removable.
 */
void test(int n) { /* Scattering iterators. */ int t1, t2, t3; /* Original iterators. */ int i, j, k; /* t1 = -n+1: S1(1,n,k), one call per t3 of matching parity. */ if (n >= 1) { t1 = -n+1 ; t2 = n+1 ; for (t3=n+3;t3<=3*n+1;t3++) { if ((t3+n+1)%2 == 0) { k = (t3-n-1)/2 ; S1(1,n,(t3-n-1)/2) ; } } } /* t1 = -n+2, case n == 2 only. */ if ((n >= 2) && (n <= 2)) { t1 = -n+2 ; for (t2=-n+4;t2<=3*n-2;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t2+n)%2 == 0) { i = (t2-n+2)/2 ; j = (t2+n-2)/2 ; if ((t3+n)%2 == 0) { k = (-t2+t3)/2 ; S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ; } } } } t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(1,n,t3) ; } } /* t1 = -n+2, case n >= 3. */ if (n >= 3) { t1 = -n+2 ; for (t2=n;t2<=n+2;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t2+n)%2 == 0) { i = (t2-n+2)/2 ; j = (t2+n-2)/2 ; if ((t3+n)%2 == 0) { k = (-t2+t3)/2 ; S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ; } } } } t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(1,n,t3) ; } } /* Negative t1, first range. */ for (t1=ceild(-2*n+5,2);t1<=min(-n+6,-1);t1++) { for (t2=-t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } t2 = t1+2*n+1 ; for (t3=1;t3<=n;t3++) { i = t1+n-1 ; S2(t1+n-1,n,t3) ; } } /* Extra instances emitted only for n == 2. */ if (n == 2) { for (t3=5;t3<=7;t3++) { if ((t3+1)%2 == 0) { k = (t3-3)/2 ; S1(2,1,(t3-3)/2) ; } } for (t2=4;t2<=6;t2++) { for (t3=1;t3<=2;t3++) { if (t2%2 == 0) { i = (t2-2)/2 ; j = (t2-2)/2 ; S2((t2-2)/2,(t2-2)/2,t3) ; } } } } /* Negative t1, second range (from -n+7). */ for (t1=-n+7;t1<=-1;t1++) { for (t2=-t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 
0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } t2 = t1+2*n+1 ; for (t3=1;t3<=n;t3++) { i = t1+n-1 ; S2(t1+n-1,n,t3) ; } } /* t1 in 0..1, upper bound capped by -n+6. */ if (n >= 3) { for (t1=0;t1<=min(1,-n+6);t1++) { for (t2=t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } } /* t1 in 0..1, lower bound raised to -n+7. */ for (t1=max(-n+7,0);t1<=1;t1++) { for (t2=t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j 
= (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } /* t1 from 2 to n-5. */ for (t1=2;t1<=n-5;t1++) { t2 = t1+2 ; for (t3=t1+4;t3<=t1+2*n+2;t3++) { i = t1+1 ; if ((t1+t3)%2 == 0) { k = (-t1+t3-2)/2 ; S1(t1+1,1,(-t1+t3-2)/2) ; } } for (t2=t1+3;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = 
(-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } /* t1 from max(2,n-4) to floord(2*n-3,2). */ for (t1=max(2,n-4);t1<=floord(2*n-3,2);t1++) { t2 = t1+2 ; for (t3=t1+4;t3<=t1+2*n+2;t3++) { i = t1+1 ; if ((t1+t3)%2 == 0) { k = (-t1+t3-2)/2 ; S1(t1+1,1,(-t1+t3-2)/2) ; } } for (t2=t1+3;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } /* t1 = n-1, case n >= 3. */ if (n >= 3) { t1 = n-1 ; t2 = n+1 ; for (t3=n+3;t3<=3*n+1;t3++) { if ((t3+n+1)%2 == 0) { k = (t3-n-1)/2 ; S1(n,1,(t3-n-1)/2) ; } } for (t2=n+2;t2<=n+4;t2++) { for (t3=1;t3<=n;t3++) { if ((t2+n)%2 == 0) { i = (t2+n-4)/2 ; j = (t2-n)/2 ; S2((t2+n-4)/2,(t2-n)/2,t3) ; } } } } /* Final S2(n,1,t3) sweep for t3 = 1..n. */ if (n >= 1) { t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(n,1,t3) ; } } }
for (int c0 = 0; c0 < m; c0 += 32) for (int c1 = (n >= 32 && m >= c0 + 2) || (m == 1 && c0 == 0) ? 0 : 32 * n - 32 * floord(31 * n + 31, 32); c1 <= ((n <= -1 && c0 == 0) || (m == 1 && n >= 0 && c0 == 0) ? max(0, n - 1) : n); c1 += 32) for (int c2 = c0; c2 <= (m >= 2 && c0 + 31 >= m && n >= c1 && c1 + 31 >= n ? 2 * m - 3 : (m >= 2 * c0 + 63 && c1 <= -32 && n >= c1 && c1 + 31 >= n) || (m >= c0 + 32 && 2 * c0 + 62 >= m && n >= c1 && c1 + 31 >= n) || (n >= 0 && c0 >= 32 && m >= 2 * c0 + 63 && c1 == n) || (m >= 63 && n >= 32 && c0 == 0 && c1 == n) ? 2 * c0 + 61 : m - 1); c2 += 32) { if (n >= c1 + 32 && c1 >= 0 && 2 * c0 >= c2 + 32) { for (int c4 = 0; c4 <= 31; c4 += 1) for (int c5 = max(0, c0 - c2 + 1); c5 <= min(31, m - c2 - 1); c5 += 1) S_27(c0, c2 + c5, c1 + c4); } else if (c0 >= 32 && c1 >= 0 && c2 >= 2 * c0) { for (int c4 = 0; c4 <= min(31, n - c1 - 1); c4 += 1) for (int c5 = 0; c5 <= min(31, m - c2 - 1); c5 += 1) S_27(c0, c2 + c5, c1 + c4); } else if (c0 == 0 && c1 >= 0) { for (int c4 = 0; c4 <= min(31, n - c1 - 1); c4 += 1) for (int c5 = 0; c5 <= min(31, m - c2 - 1); c5 += 1) { if (c1 == 0 && c4 == 0) S_14(c2 + c5); S_19(c1 + c4, c2 + c5); if (c2 + c5 >= 1) S_27(0, c2 + c5, c1 + c4); } } if (c1 >= 0) { for (int c3 = 1; c3 <= min(31, (c2 / 2) - c0); c3 += 1) for (int c4 = 0; c4 <= min(31, n - c1 - 1); c4 += 1) for (int c5 = 0; c5 <= min(31, m - c2 - 1); c5 += 1) S_27(c0 + c3, c2 + c5, c1 + c4); if (n >= c1 + 32) { for (int c3 = max(1, (c2 / 2) - c0 + 1); c3 <= min(min(31, m - c0 - 2), -c0 + c2 + 30); c3 += 1) for (int c4 = 0; c4 <= 31; c4 += 1) for (int c5 = max(0, c0 - c2 + c3 + 1); c5 <= min(31, m - c2 - 1); c5 += 1) S_27(c0 + c3, c2 + c5, c1 + c4);