/*
 * Scans a four-deep generated loop nest whose bounds depend on M and invokes
 * the statement S1 at every point that passes the inner range test.
 *
 * S1, min, max, floord and ceild are supplied by the including file (they are
 * not defined in this block). Nothing is executed when M is below 3.
 */
void test(int M) {
  /* Scattering iterators. */
  int c1, c2, c3, c4;
  /* Original iterators. */
  int i, j, k, l;

  /* The generated domain is empty for M < 3. */
  if (M < 3) {
    return;
  }

  for (c1 = -1; c1 <= min(2, floord(M + 2, 4)); c1++) {
    for (c2 = max(ceild(2 * c1 - M + 1, 4), ceild(4 * c1 - M - 2, 4));
         c2 <= min(0, floord(c1, 2));
         c2++) {
      for (c3 = max(max(-4 * c2 - 2, 4 * c2 + 3), 4 * c1 - 4 * c2 + 1);
           c3 <= min(min(min(M + 3, -4 * c2 + 9), 4 * c2 + 2 * M),
                     4 * c1 - 4 * c2 + 4);
           c3++) {
        /* c4 advances in steps of 4. */
        for (c4 = max(3 * c3 - 4 * floord(c3 + M + 1, 2) + 6,
                      4 * c2 - c3 - 4 * floord(-c3 + 1, 4) + 2);
             c4 <= min(min(4 * c2 + 4, -c3 + 10), c3 - 2);
             c4 += 4) {
          /* Skip points where c2 falls outside [ceild(c4-4,4), floord(c4-1,4)]. */
          if ((c2 > floord(c4 - 1, 4)) || (c2 < ceild(c4 - 4, 4))) {
            continue;
          }
          S1(c1 - c2, c2, (c3 + c4 - 2) / 4, (c3 - c4) / 2);
        }
      }
    }
  }
}
/*
 * Scans a two-deep generated loop nest parameterised by n and invokes the
 * statement S1 for every even value of c2.
 *
 * S1, min, max, floord and ceild come from the including file. The loop body
 * never runs when 5*n is negative.
 */
void test(int n) {
  /* Scattering iterators. */
  int c1, c2;
  /* Original iterators. */
  int i, j;

  for (c1 = 0; c1 <= 5 * n; c1++) {
    for (c2 = max(c1 - n, ceild(2 * c1, 3));
         c2 <= min(c1, floord(2 * c1 + 2 * n, 3));
         c2++) {
      /* Only even c2 values carry a statement instance. */
      if (c2 % 2 != 0) {
        continue;
      }
      /* The original iterators are recomputed before the statement runs. */
      i = (-2 * c1 + 3 * c2) / 2;
      j = c1 - c2;
      S1((-2 * c1 + 3 * c2) / 2, c1 - c2);
    }
  }
}
/*
 * Scans the points of one tile, identified by outerTimeTileScatter,
 * outerProcTileScatter1 and outerProcTileScatter2, for problem sizes M and N,
 * and invokes the statement S1 at each point.
 *
 * The function returns without doing anything unless all of the generated
 * tile/parameter constraints hold. S1, min, max, floord and ceild come from
 * the including file.
 */
void test(int outerTimeTileScatter, int outerProcTileScatter1, int outerProcTileScatter2, int M, int N) {
  /* Scattering iterators. */
  int compScatter1, compScatter2, compScatter3;
  /* Original iterators. */
  int compIter1, compIter2, compIter3, compIter4, compIter5, compIter6;

  /* Leave early when the tile lies outside the generated domain. */
  if (!((M >= 2)
        && (N >= 3)
        && (outerProcTileScatter1 >= outerProcTileScatter2)
        && (5 * outerProcTileScatter1 <= M + 2 * N - 4)
        && (5 * outerProcTileScatter1 <= 5 * outerProcTileScatter2 + N + 2)
        && (outerProcTileScatter2 >= 0)
        && (5 * outerProcTileScatter2 <= M + N - 2)
        && (outerTimeTileScatter >= outerProcTileScatter1)
        && (outerTimeTileScatter <= 2 * outerProcTileScatter1)
        && (outerTimeTileScatter <= outerProcTileScatter1 + outerProcTileScatter2 + 1)
        && (5 * outerTimeTileScatter <= 2 * M + 2 * N - 6)
        && (5 * outerTimeTileScatter <= 5 * outerProcTileScatter1 + M + 2)
        && (5 * outerTimeTileScatter >= 10 * outerProcTileScatter1 - 2 * N - 2)
        && (5 * outerTimeTileScatter <= 5 * outerProcTileScatter2 + M + N)
        && (5 * outerTimeTileScatter >= 10 * outerProcTileScatter2 - N - 3)
        && (5 * outerTimeTileScatter <= 10 * outerProcTileScatter2 + N + 3)
        && (5 * outerTimeTileScatter >= 5 * outerProcTileScatter1 + 5 * outerProcTileScatter2 - N - 4))) {
    return;
  }

  for (compScatter1 = max(max(max(max(max(4,
                                          5 * outerTimeTileScatter),
                                      5 * outerProcTileScatter2 + 1),
                                  5 * outerProcTileScatter1 + 5 * outerProcTileScatter2 - N),
                              10 * outerProcTileScatter1 - 2 * N + 2),
                          10 * outerProcTileScatter2 - N + 1);
       compScatter1 <= min(min(min(min(min(5 * outerTimeTileScatter + 4,
                                           2 * M + 2 * N - 6),
                                       5 * outerProcTileScatter1 + M + 2),
                                   5 * outerProcTileScatter1 + 5 * outerProcTileScatter2 + 5),
                               5 * outerProcTileScatter2 + M + N),
                           10 * outerProcTileScatter2 + N + 3);
       compScatter1++) {
    for (compScatter2 = max(max(max(max(ceild(compScatter1 + 4, 2),
                                        5 * outerProcTileScatter1),
                                    5 * outerProcTileScatter2 + 1),
                                compScatter1 - M + 2),
                            compScatter1 - 5 * outerProcTileScatter2 - 1);
         compScatter2 <= min(min(min(min(floord(compScatter1 + 2 * N - 2, 2),
                                         compScatter1),
                                     5 * outerProcTileScatter1 + 4),
                                 compScatter1 - 5 * outerProcTileScatter2 + N),
                             5 * outerProcTileScatter2 + N + 2);
         compScatter2++) {
      for (compScatter3 = max(max(5 * outerProcTileScatter2,
                                  compScatter1 - compScatter2 + 3),
                              compScatter2 - N + 2);
           compScatter3 <= min(min(compScatter2 - 1,
                                   5 * outerProcTileScatter2 + 4),
                               compScatter1 - compScatter2 + N);
           compScatter3++) {
        S1(compScatter1 - compScatter2 + 1,
           -compScatter1 + compScatter2 + compScatter3 - 2,
           compScatter2 - compScatter3,
           compScatter1,
           compScatter2,
           compScatter3);
      }
    }
  }
}
int main(void) { int t = 0, z, y, x, k; double total_lattice_pts = (double)nZ * (double)nY * (double)nX * (double)nTimesteps; /* For timekeeping */ int ts_return = -1; struct timeval start, end, result; double tdiff = 0.0; /* Compute values for global parameters */ omega = 2.0 / ((6.0 * sqrt(uTopX * uTopX + uTopY * uTopY) * (nX - 0.5) / re) + 1.0); printf( "3D Lid Driven Cavity simulation with D3Q19 lattice:\n" "\tscheme : 3-Grid, Fused, Pull\n" "\tgrid size : %d x %d x %d = %.2lf * 10^3 Cells\n" "\tnTimeSteps : %d\n" "\tRe : %.2lf\n" "\tuTopX : %.6lf\n" "\tuTopY : %.6lf\n" "\tomega : %.6lf\n", nX, nY, nZ, nX * nY * nZ / 1.0e3, nTimesteps, re, uTopX, uTopY, omega); /* Initialize all 19 PDFs for each point in the domain to w1, w2 or w3 * accordingly */ for (z = 0; z < nZ + 2 + 4; z++) { for (y = 0; y < nY + 2 + 2; y++) { for (x = 0; x < nX + 2 + 2; x++) { grid[0][z][y][x][0] = w1; grid[1][z][y][x][0] = w1; for (k = 1; k < 7; k++) { grid[0][z][y][x][k] = w2; grid[1][z][y][x][k] = w2; } for (k = 7; k < nK; k++) { grid[0][z][y][x][k] = w3; grid[1][z][y][x][k] = w3; } } } } /* To satisfy PET */ short _nX = nX + 3; short _nY = nY + 3; short _nZ = nZ + 4; short _nTimesteps = nTimesteps; #ifdef TIME gettimeofday(&start, 0); #endif int t1, t2, t3, t4, t5, t6, t7, t8; int lb, ub, lbp, ubp, lb2, ub2; register int lbv, ubv; /* Start of CLooG code */ if ((_nTimesteps >= 1) && (_nX >= 2) && (_nY >= 2) && (_nZ >= 3)) { for (t1 = -1; t1 <= floord(_nTimesteps - 1, 8); t1++) { lbp = max(ceild(t1, 2), ceild(16 * t1 - _nTimesteps + 3, 16)); ubp = min(floord(_nTimesteps + _nZ - 2, 16), floord(8 * t1 + _nZ + 6, 16)); #pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6, t7, t8) for (t2 = lbp; t2 <= ubp; t2++) { for (t3 = max(max(0, ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16)); t3 <= min(min(min(floord(_nTimesteps + _nY - 2, 16), floord(8 * t1 + _nY + 14, 16)), floord(16 * t2 + _nY + 12, 16)), floord(16 * t1 - 16 * t2 + _nZ + _nY + 13, 16)); t3++) { for (t4 = max( max(max(0, 
ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16)), ceild(16 * t3 - _nY - 13, 16)); t4 <= min(min(min(min(floord(_nTimesteps + _nX - 2, 16), floord(8 * t1 + _nX + 14, 16)), floord(16 * t2 + _nX + 12, 16)), floord(16 * t3 + _nX + 13, 16)), floord(16 * t1 - 16 * t2 + _nZ + _nX + 13, 16)); t4++) { for (t5 = max(max(max(max(max(0, 8 * t1), 16 * t1 - 16 * t2 + 2), 16 * t2 - _nZ + 1), 16 * t3 - _nY + 1), 16 * t4 - _nX + 1); t5 <= min(min(min(min(min(_nTimesteps - 1, 8 * t1 + 15), 16 * t2 + 13), 16 * t3 + 14), 16 * t4 + 14), 16 * t1 - 16 * t2 + _nZ + 14); t5++) { /* Hoisted loop conditional */ if (t5 % 2 == 0) { for (t6 = max(max(16 * t2, t5 + 2), -16 * t1 + 16 * t2 + 2 * t5 - 15); t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5), t5 + _nZ - 1); t6++) { for (t7 = max(16 * t3, t5 + 1); t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) { lbv = max(16 * t4, t5 + 1); ubv = min(16 * t4 + 15, t5 + _nX - 1); #pragma ivdep #pragma vector always for (t8 = lbv; t8 <= ubv; t8++) { lbm_kernel( grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1], grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1] [7], grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1] [8], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1] [9], grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) + 1] [10], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1] [11], grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1] [12], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1] [13], grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1] [14], grid[0][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)] [15], grid[0][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)] [16], grid[0][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)] [17], grid[0][(-t5 
+ t6) + 1][(-t5 + t7) + 1][(-t5 + t8)] [18], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17], &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18], (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8))); ; } } } } else { for (t6 = max(max(16 * t2, t5 + 2), -16 * t1 + 16 * t2 + 2 * t5 - 15); t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5), t5 + _nZ - 1); t6++) { for (t7 = max(16 * t3, t5 + 1); t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) { lbv = max(16 * t4, t5 + 1); ubv = min(16 * t4 + 15, t5 + _nX - 1); #pragma ivdep #pragma vector always for (t8 = lbv; t8 <= ubv; t8++) { lbm_kernel( grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1], grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3], grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1] [7], grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1] [8], grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1] [9], grid[1][(-t5 + t6)][(-t5 + t7) + 
1][(-t5 + t8) + 1] [10], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1] [11], grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1] [12], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1] [13], grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1] [14], grid[1][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)] [15], grid[1][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)] [16], grid[1][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)] [17], grid[1][(-t5 + t6) + 1][(-t5 + t7) + 1][(-t5 + t8)] [18], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17], &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18], (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8))); ; } } } } /* end hoisted if */ } } } } } } /* End of CLooG code */ #ifdef TIME gettimeofday(&end, 0); ts_return = timeval_subtract(&result, &end, &start); tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6); printf("\tTime taken : %7.5lfm\n", tdiff / 60.0); printf("\tMLUPS : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff))); #endif #ifdef DEBUG /* Dump rho, uX, uY for the entire domain to verify results */ dumpVelocities(t); #endif return 0; }
int main(void) { int t, y, x, k; double total_lattice_pts = (double)nY * (double)nX * (double)nTimesteps; /* For timekeeping */ int ts_return = -1; struct timeval start, end, result; double tdiff = 0.0; /* Compute values for global parameters */ omega = 1.0 / tau; circle_R2 = circle_radius * circle_radius; double rho_avg = (rho_in + rho_out) / 2.0; printf( "2D Flow Past Cylinder simulation with D2Q9 lattice:\n" "\tscheme : 2-Grid, Fused, Pull\n" "\tgrid size : %d x %d = %.2lf * 10^3 Cells\n" "\tnTimeSteps : %d\n" "\tomega : %.6lf\n", nX, nY, nX * nY / 1.0e3, nTimesteps, omega); /* Initialize all 9 PDFs for each point in the domain to 1.0 */ for (y = 0; y < nY + 2 + 4; y++) { for (x = 0; x < nX + 2 + 2; x++) { grid[0][y][x][0] = w1 * rho_avg; grid[1][y][x][0] = w1 * rho_avg; for (k = 1; k < 5; k++) { grid[0][y][x][k] = w2 * rho_avg; grid[1][y][x][k] = w2 * rho_avg; } for (k = 5; k < nK; k++) { grid[0][y][x][k] = w3 * rho_avg; grid[1][y][x][k] = w3 * rho_avg; } } } /* To satisfy PET */ short _nX = nX + 2; short _nY = nY + 3; int _nTimesteps = nTimesteps; #ifdef TIME gettimeofday(&start, 0); #endif int t1, t2, t3, t4, t5, t6; int lb, ub, lbp, ubp, lb2, ub2; register int lbv, ubv; /* Start of CLooG code */ if ((_nTimesteps >= 1) && (_nX >= 3) && (_nY >= 4)) { for (t1 = -1; t1 <= floord(5 * _nTimesteps + 3 * _nY - 8, 32); t1++) { lbp = max(max(ceild(4 * t1, 5), ceild(16 * t1 - _nTimesteps + 1, 12)), ceild(32 * t1 - _nTimesteps + 4, 32)); ubp = min(min(floord(4 * t1 + 4, 3), floord(_nTimesteps + _nY - 2, 8)), floord(16 * t1 + _nY + 14, 20)); #pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6) for (t2 = lbp; t2 <= ubp; t2++) { for (t3 = max(max(0, ceild(4 * t1 - 3 * t2 - 1, 2)), ceild(8 * t2 - _nY - 4, 8)); t3 <= min(min(min(floord(_nTimesteps + _nX - 2, 8), floord(8 * t2 + _nX + 3, 8)), floord(16 * t1 - 12 * t2 + _nX + 18, 8)), floord(32 * t1 - 32 * t2 + _nY + _nX + 29, 8)); t3++) { for (t4 = max( max(max(max(0, 16 * t1 - 12 * t2), 32 * t1 - 32 * t2 + 3), 8 * t2 - 
_nY + 1), 8 * t3 - _nX + 1); t4 <= min(min(min(min(_nTimesteps - 1, 8 * t2 + 4), 8 * t3 + 5), 16 * t1 - 12 * t2 + 19), 32 * t1 - 32 * t2 + _nY + 30); t4++) { /* Hoisted loop conditional */ if (t4 % 2 == 0) { for (t5 = max(max(8 * t2, t4 + 3), -32 * t1 + 32 * t2 + 2 * t4 - 31); t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4), t4 + _nY - 1); t5++) { lbv = max(8 * t3, t4 + 2); ubv = min(8 * t3 + 7, t4 + _nX - 1); #pragma ivdep #pragma vector always for (t6 = lbv; t6 <= ubv; t6++) { lbm_kernel(grid[0][(-t4 + t5)][(-t4 + t6)][0], grid[0][(-t4 + t5) - 1][(-t4 + t6)][3], grid[0][(-t4 + t5) + 1][(-t4 + t6)][4], grid[0][(-t4 + t5)][(-t4 + t6) - 1][1], grid[0][(-t4 + t5)][(-t4 + t6) + 1][2], grid[0][(-t4 + t5) - 1][(-t4 + t6) - 1][5], grid[0][(-t4 + t5) - 1][(-t4 + t6) + 1][6], grid[0][(-t4 + t5) + 1][(-t4 + t6) - 1][7], grid[0][(-t4 + t5) + 1][(-t4 + t6) + 1][8], &grid[1][(-t4 + t5)][(-t4 + t6)][0], &grid[1][(-t4 + t5)][(-t4 + t6)][3], &grid[1][(-t4 + t5)][(-t4 + t6)][4], &grid[1][(-t4 + t5)][(-t4 + t6)][1], &grid[1][(-t4 + t5)][(-t4 + t6)][2], &grid[1][(-t4 + t5)][(-t4 + t6)][5], &grid[1][(-t4 + t5)][(-t4 + t6)][6], &grid[1][(-t4 + t5)][(-t4 + t6)][7], &grid[1][(-t4 + t5)][(-t4 + t6)][8], (t4), ((-t4 + t5)), ((-t4 + t6))); ; } } } else { for (t5 = max(max(8 * t2, t4 + 3), -32 * t1 + 32 * t2 + 2 * t4 - 31); t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4), t4 + _nY - 1); t5++) { lbv = max(8 * t3, t4 + 2); ubv = min(8 * t3 + 7, t4 + _nX - 1); #pragma ivdep #pragma vector always for (t6 = lbv; t6 <= ubv; t6++) { lbm_kernel(grid[1][(-t4 + t5)][(-t4 + t6)][0], grid[1][(-t4 + t5) - 1][(-t4 + t6)][3], grid[1][(-t4 + t5) + 1][(-t4 + t6)][4], grid[1][(-t4 + t5)][(-t4 + t6) - 1][1], grid[1][(-t4 + t5)][(-t4 + t6) + 1][2], grid[1][(-t4 + t5) - 1][(-t4 + t6) - 1][5], grid[1][(-t4 + t5) - 1][(-t4 + t6) + 1][6], grid[1][(-t4 + t5) + 1][(-t4 + t6) - 1][7], grid[1][(-t4 + t5) + 1][(-t4 + t6) + 1][8], &grid[0][(-t4 + t5)][(-t4 + t6)][0], &grid[0][(-t4 + t5)][(-t4 + t6)][3], 
&grid[0][(-t4 + t5)][(-t4 + t6)][4], &grid[0][(-t4 + t5)][(-t4 + t6)][1], &grid[0][(-t4 + t5)][(-t4 + t6)][2], &grid[0][(-t4 + t5)][(-t4 + t6)][5], &grid[0][(-t4 + t5)][(-t4 + t6)][6], &grid[0][(-t4 + t5)][(-t4 + t6)][7], &grid[0][(-t4 + t5)][(-t4 + t6)][8], (t4), ((-t4 + t5)), ((-t4 + t6))); ; } } } /* end hoisted if */ } } } } } /* End of CLooG code */ #ifdef TIME gettimeofday(&end, 0); ts_return = timeval_subtract(&result, &end, &start); tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6); printf("\tTime taken : %7.5lfs\n", tdiff); printf("\tMLUPS : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff))); #endif #ifdef DEBUG /* Dump rho, uX, uY for the entire domain to verify results */ dumpVelocities(t); #endif return 0; }
/*
 * Benchmark driver for a tiled, in-place 9-point averaging update of A.
 *
 * Calls init_arrays() once, then executes the loop nest REPS times. Each
 * repetition is timed with rtclock() and the mean time per repetition is
 * printed with "%f\n".
 *
 * The statement macro S1 replaces A[i][j] with the sum of the nine entries
 * A[i-1..i+1][j-1..j+1] divided by 9; its first seven arguments (tile
 * coordinates and t) are not used by the expansion. The loop bounds depend on
 * T and N, which are defined outside this block. According to the comment in
 * the body, the nest was generated by CLooG from a PLuTo-produced file.
 *
 * Each repetition enables nested OpenMP parallelism and sets the thread count
 * to 2; the c2 and c3 loops both carry "omp parallel for" pragmas. The text
 * between the begin/end Loop markers is a comment holding an annotation
 * (UnrollJam / Unroll with ufactor=8); the plain c8/c9 loops follow it.
 *
 * Returns A[0][0] converted to int.
 *
 * NOTE(review): in this copy the #include, #define and #pragma directives sit
 * in the middle of a physical line. The C preprocessor only recognises a
 * directive at the start of a line, so each one needs to be back on its own
 * line before this compiles -- confirm against the upstream file.
 */
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); #include <math.h> #include <assert.h> #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) #define S1(zT0,zT1,zT2,zT3,zT4,zT5,t,i,j) {A[i][j]=(A[1+i][1+j]+A[1+i][j]+A[1+i][j-1]+A[i][1+j]+A[i][j]+A[i][j-1]+A[i-1][1+j]+A[i-1][j]+A[i-1][j-1])/9;} int c1, c2, c3, c4, c5, c6, c7, c8, c9; register int lb, ub, lb1, ub1, lb2, ub2; register int lbv, ubv; omp_set_nested(1); omp_set_num_threads(2); /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 5.45s. */ for (c1=-2;c1<=floord(4*T+3*N-10,256);c1++) { lb1=max(max(max(0,ceild(256*c1-2*T-N-251,512)),ceild(256*c1-3*T-2*N+7,256)),ceild(256*c1-N-761,1024)); ub1=min(min(min(floord(256*c1+2*N+505,1024),floord(256*c1+509,512)),floord(64*c1+127,64)),floord(T+N-3,256)); #pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9) for (c2=lb1; c2<=ub1; c2++) { lb2=max(max(max(max(max(max(ceild(256*c1-256*c2-T+1,256),ceild(512*c1-512*c2-253,768)),0),ceild(512*c2-N-252,256)),ceild(128*c1-256*c2-127,128)),ceild(128*c2-127,128)),ceild(128*c1-127,256)); ub2=min(min(min(min(min(min(floord(256*c1-256*c2+255,256),floord(256*c1-512*c2+N+253,256)),floord(256*c2+T+N+252,256)),floord(T+N-3,128)),floord(256*c1+N+508,512)),floord(256*c1-256*c2+N+253,384)),floord(512*c2+N+507,256)); #pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9) for (c3=lb2; c3<=ub2; c3++) { for (c4=max(max(max(max(0,ceild(-256*c2+256*c3-N-284,32)),8*c1-8*c2-8*c3),ceild(256*c2-N-29,32)),ceild(128*c3-N-29,32));c4<=min(min(min(min(8*c1-8*c2-8*c3+7,floord(256*c3+253,64)),floord(T-1,32)),floord(128*c2+127,16)),floord(-128*c2+128*c3+127,16));c4++) { for 
(c5=max(max(max(max(max(8*c2,ceild(16*c4-15,16)),ceild(256*c3-T-N-28,32)),0),ceild(256*c3-32*c4-N-60,32)),ceild(256*c3-N-59,64));c5<=min(min(min(min(min(floord(32*c4+N+29,32),floord(128*c3+127,16)),8*c2+7),floord(128*c3-16*c4+127,16)),floord(T+N-3,32)),floord(256*c3+N+252,64));c5++) { for (c6=max(max(max(max(max(ceild(64*c4-29,32),8*c3),ceild(16*c5-15,16)),ceild(16*c4+16*c5-15,16)),0),ceild(64*c5-N-28,32));c6<=min(min(min(min(min(8*c3+7,floord(T+N-3,16)),floord(32*c4+32*c5+N+60,32)),floord(32*c4+N+29,16)),floord(64*c5+N+59,32)),floord(32*c5+T+N+28,32));c6++) { for (c7=max(max(max(max(0,32*c4),32*c5-N+2),16*c6-N+2),-32*c5+32*c6-N-29);c7<=min(min(min(min(-32*c5+32*c6+30,floord(32*c6+29,2)),T-1),32*c5+30),32*c4+31);c7++) { /*@ begin Loop( transform UnrollJam(ufactor=8) for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) transform Unroll(ufactor=8) for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) { S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ; } ) @*/ for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) { for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) { S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ; } } /*@ end @*/ } } } } } } } /* End of CLooG code */ annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; printf("%f\n", annot_t_total); return ((int) A[0][0]); }
/* Generated from ./non_optimal/nul_complex1.cloog by CLooG 0.18.1-2-g43fc508 gmp bits in 0.00s. */
/*
 * Scan of a two-deep loop nest parameterised by n: S1 is invoked for every
 * even c2 inside the generated bounds. n, c1, c2, S1, min, max, floord and
 * ceild are all provided by the surrounding code.
 */
if (n >= 0) {
  for (c1 = 0; c1 <= 5 * n; c1++) {
    for (c2 = max(ceild(2 * c1, 3), c1 - n);
         c2 <= min(floord(2 * c1 + 2 * n, 3), c1);
         c2++) {
      /* Odd c2 values carry no statement instance. */
      if (c2 % 2 != 0) {
        continue;
      }
      S1(((-2 * c1 + 3 * c2) / 2), (c1 - c2));
    }
  }
}
/*
 * Benchmark driver for a tiled time-stepping update of the arrays ex, ey and
 * hz (the access pattern looks like a 2-D FDTD kernel -- confirm).
 *
 * Calls init_arrays() once, then executes the loop nest REPS times. Each
 * repetition is timed with rtclock() and the mean time per repetition is
 * printed with "%f\n".
 *
 * Statement macros (leading zT* arguments are not used by the expansions):
 *   S1: ey[0][j] = t
 *   S2: ey[i][j] -= (1.0/2) * (hz[i][j] - hz[i-1][j])
 *   S3: ex[i][j] -= (1.0/2) * (hz[i][j] - hz[i][j-1])
 *   S4: hz[i][j] -= (7.0/10) * (ey[1+i][j] + ex[i][1+j] - ex[i][j] - ey[i][j])
 *
 * c1/c2/c3 enumerate tiles whose bounds are multiples of 32 and depend on
 * tmax, nx and ny (defined outside this block); c4 is passed as the t
 * argument and c5/c6 supply the j and i arguments after subtracting c4. The
 * many separate c4 loops and guarded blocks handle the different boundary
 * cases of a tile; the order of S1..S4 inside them is significant because the
 * statements read values written by one another.
 *
 * The text between the begin/end Loop markers is a comment holding a
 * Composite annotation (tile / permut / unrolljam / vector). The code after
 * it is the transformed form of the full-tile case: c6 is unrolled by 4 with
 * c5 as the inner loop under "ivdep" / "vector always" pragmas, followed by a
 * remainder loop over c6.
 *
 * NOTE(review): the function returns 1, which a shell would treat as a
 * failure status -- confirm that the benchmark harness ignores the exit code.
 * NOTE(review): in this copy the #define and #pragma directives sit in the
 * middle of a physical line; they need to be back on their own lines before
 * this compiles -- confirm against the upstream file.
 */
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); int t, i, j, k, l, m, n,ii,jj; #define S1(zT0,zT1,t,j) {ey[0][j]=t;} #define S2(zT0,zT1,zT2,t,i,j) {ey[i][j]=ey[i][j]-((double)(1))/2*(hz[i][j]-hz[i-1][j]);} #define S3(zT0,zT1,zT2,t,i,j) {ex[i][j]=ex[i][j]-((double)(1))/2*(hz[i][j]-hz[i][j-1]);} #define S4(zT0,zT1,zT2,t,i,j) {hz[i][j]=hz[i][j]-((double)(7))/10*(ey[1+i][j]+ex[i][1+j]-ex[i][j]-ey[i][j]);} int c1, c2, c3, c4, c5, c6, c7; register int lbv, ubv; for (c1=0;c1<=floord(tmax-1,32);c1++) { for (c2=max(ceild(32*c1-31,32),0);c2<=min(floord(tmax+ny-1,32),floord(32*c1+ny+31,32));c2++) { for (c3=max(max(max(max(ceild(32*c2-ny-30,32),0),ceild(64*c1-32*c2-61,32)),ceild(32*c1-31,32)),ceild(32*c1-992*c2-1891,992));c3<=min(min(floord(32*c2+nx+30,32),floord(tmax+nx-1,32)),floord(32*c1+nx+31,32));c3++) { if ((c1 <= floord(32*c3-nx,32)) && (c2 <= floord(32*c3-nx+ny,32)) && (c3 >= ceild(nx,32))) { for (c5=max(32*c3-nx+1,32*c2);c5<=min(32*c2+31,32*c3-nx+ny);c5++) { S4(c1,-c1+c3,-c1+c2,32*c3-nx,nx-1,-32*c3+c5+nx-1) ; } } if ((c1 <= floord(32*c2-ny,32)) && (c2 >= max(ceild(32*c3-nx+ny+1,32),ceild(ny,32)))) { for (c6=max(32*c3,32*c2-ny+1);c6<=min(32*c2+nx-ny,32*c3+31);c6++) { S4(c1,-c1+c3,-c1+c2,32*c2-ny,-32*c2+c6+ny-1,ny-1) ; } } if (c1 == c3) { for (c4=max(max(32*c2-ny+1,0),32*c3);c4<=min(min(32*c3+30,32*c2-ny+31),tmax-1);c4++) { for (c5=32*c2;c5<=c4+ny-1;c5++) { S1(c1,-c1+c2,c4,-c4+c5) ; S3(c1,0,-c1+c2,c4,0,-c4+c5) ; for (c6=c4+1;c6<=32*c3+31;c6++) { S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } } for (c6=c4+1;c6<=32*c3+31;c6++) { S4(c1,0,-c1+c2,c4,-c4+c6-1,ny-1) ; } } } if (c1 == c3) { for (c4=max(max(0,32*c3),32*c2-ny+32);c4<=min(min(tmax-1,32*c3+30),32*c2-1);c4++) { for (c5=32*c2;c5<=32*c2+31;c5++) { S1(c1,-c1+c2,c4,-c4+c5) ; S3(c1,0,-c1+c2,c4,0,-c4+c5) ; for (c6=c4+1;c6<=32*c3+31;c6++) { 
S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } } } } if (c1 == c3) { for (c4=max(max(32*c2,0),32*c3);c4<=min(min(tmax-1,32*c3+30),32*c2+30);c4++) { S1(c1,-c1+c2,c4,0) ; for (c6=c4+1;c6<=32*c3+31;c6++) { S2(c1,0,-c1+c2,c4,-c4+c6,0) ; } for (c5=c4+1;c5<=32*c2+31;c5++) { S1(c1,-c1+c2,c4,-c4+c5) ; S3(c1,0,-c1+c2,c4,0,-c4+c5) ; for (c6=c4+1;c6<=32*c3+31;c6++) { S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } } } } for (c4=max(max(max(32*c1,0),32*c2-ny+1),32*c3-nx+1);c4<=min(min(min(32*c3-nx+31,32*c2-ny+31),32*c1+31),tmax-1);c4++) { for (c5=32*c2;c5<=c4+ny-1;c5++) { for (c6=32*c3;c6<=c4+nx-1;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ; } for (c6=32*c3;c6<=c4+nx;c6++) { S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ; } } for (c4=max(max(max(32*c1,0),32*c3-nx+1),32*c2-ny+32);c4<=min(min(min(tmax-1,32*c1+31),32*c2-1),32*c3-nx+31);c4++) { for (c5=32*c2;c5<=32*c2+31;c5++) { for (c6=32*c3;c6<=c4+nx-1;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ; } } for (c4=max(max(max(32*c3-nx+32,32*c1),0),32*c2-ny+1);c4<=min(min(min(32*c2-ny+31,32*c1+31),tmax-1),32*c3-1);c4++) { for (c5=32*c2;c5<=c4+ny-1;c5++) { for (c6=32*c3;c6<=32*c3+31;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } } for (c6=32*c3;c6<=32*c3+31;c6++) { S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ; } } for (c4=max(max(max(32*c2,32*c1),0),32*c3-nx+1);c4<=min(min(min(32*c2+30,tmax-1),32*c1+31),32*c3-nx+31);c4++) { for (c6=32*c3;c6<=c4+nx-1;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ; } for (c5=c4+1;c5<=32*c2+31;c5++) { for (c6=32*c3;c6<=c4+nx-1;c6++) { 
S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ; } } for (c4=max(max(max(32*c1,0),32*c3-nx+32),32*c2-ny+32);c4<=min(min(min(32*c3-1,tmax-1),32*c1+31),32*c2-1);c4++) { /*@ begin Loop( transform Composite( tile = [('c5',T1,'ii'),('c6',T2,'jj')], permut = [PERMUTS], unrolljam = [('c5',U1),('c6',U2)], vector = (VEC, ['ivdep','vector always']) ) for (c5=32*c2;c5<=32*c2+31;c5++) for (c6=32*c3;c6<=32*c3+31;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } ) @*/{ for (c6=32*c3; c6<=32*c3+28; c6=c6+4) { register int cbv_1, cbv_2; cbv_1=32*c2; cbv_2=32*c2+31; #pragma ivdep #pragma vector always for (c5=cbv_1; c5<=cbv_2; c5=c5+1) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5); S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5); S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5); S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5); S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5); S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5); S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5); S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5); S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1); S4(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5-1); S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5-1); S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5-1); } } for (; c6<=32*c3+31; c6=c6+1) { register int cbv_3, cbv_4; cbv_3=32*c2; cbv_4=32*c2+31; #pragma ivdep #pragma vector always for (c5=cbv_3; c5<=cbv_4; c5=c5+1) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5); S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5); S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1); } } } /*@ end @*/ } for (c4=max(max(max(32*c2,32*c3-nx+32),32*c1),0);c4<=min(min(min(32*c3-1,32*c2+30),tmax-1),32*c1+31);c4++) { for (c6=32*c3;c6<=32*c3+31;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ; } for (c5=c4+1;c5<=32*c2+31;c5++) { for (c6=32*c3;c6<=32*c3+31;c6++) { S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ; 
S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ; } } } if ((c1 == c3) && (c2 <= min(floord(32*c3-1,32),floord(tmax-32,32)))) { S1(c1,-c1+c2,32*c2+31,0) ; for (c6=32*c2+32;c6<=32*c3+31;c6++) { S2(c1,0,-c1+c2,32*c2+31,-32*c2+c6-31,0) ; } } if ((-c1 == -c3) && (c1 >= ceild(32*c2-31,32)) && (c1 <= min(floord(tmax-32,32),floord(32*c2-1,32)))) { S1(c1,-c1+c2,32*c1+31,0) ; for (c5=32*c1+32;c5<=32*c2+31;c5++) { S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ; S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ; } } if ((-c1 == -c3) && (c1 <= min(floord(tmax-32,32),c2-1))) { for (c5=32*c2;c5<=min(32*c2+31,32*c1+ny+30);c5++) { S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ; S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ; } } if ((-c1 == -c2) && (-c1 == -c3) && (c1 <= floord(tmax-32,32))) { S1(c1,0,32*c1+31,0) ; } if ((c1 >= c2) && (c2 <= min(c3-1,floord(tmax-32,32)))) { for (c6=32*c3;c6<=min(32*c2+nx+30,32*c3+31);c6++) { S2(c1,-c1+c3,-c1+c2,32*c2+31,-32*c2+c6-31,0) ; } } } } } annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; printf("%f\n", annot_t_total); return 1; }
/*
 * Benchmark driver for a tiled in-place elimination on the matrix A (the two
 * statements look like LU decomposition without pivoting -- confirm).
 *
 * Calls init_arrays(), enables nested OpenMP parallelism and sets the thread
 * count to 2 once, then executes the loop nest REPS times. Each repetition is
 * timed with rtclock() and the mean time per repetition is printed with
 * "%f\n".
 *
 * Statement macros (leading zT* arguments are not used by the expansions):
 *   S1: A[k][j] = A[k][j] / A[k][k]
 *   S2: A[i][j] = A[i][j] - A[i][k] * A[k][j]
 *
 * According to the comment in the body, the nest was generated by CLooG from
 * a PLuTo-produced file; its bounds depend on N, defined outside this block.
 * The c2 and c3 loops carry "omp parallel for" pragmas. c7 is passed as k;
 * c8 and c9 are passed as j and i of S2 respectively.
 *
 * The text between the begin/end Loop markers is a comment holding an
 * annotation (UnrollJam / Unroll with ufactor=8). The code after it is the
 * transformed form: c8 and c9 are each unrolled by 8 (64 S2 instances per
 * iteration), with remainder loops for the leftover c9 and c8 values.
 *
 * Returns A[0][0] converted to int.
 *
 * NOTE(review): the test (c1-c2)%2 == 0 around S2 in the diagonal-tile case
 * is nested inside an identical test; the inner one is redundant.
 * NOTE(review): in this copy the #define and #pragma directives sit in the
 * middle of a physical line; they need to be back on their own lines before
 * this compiles -- confirm against the upstream file.
 */
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; omp_set_nested(1); omp_set_num_threads(2); for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); register int i,j,k; #define S1(zT0,zT1,zT2,zT3,k,j) {A[k][j]=A[k][j]/A[k][k];} #define S2(zT0,zT1,zT2,zT3,zT4,zT5,k,i,j) {A[i][j]=A[i][j]-A[i][k]*A[k][j];} int c1, c2, c3, c4, c5, c6, c7, c8, c9; register int lb, ub, lb1, ub1, lb2, ub2; register int lbv, ubv; /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 2.21s. */ for (c1=-2;c1<=floord(3*N-4,256);c1++) { lb1=max(max(0,ceild(256*c1-N-253,512)),ceild(256*c1-2*N+3,256)); ub1=min(floord(128*c1+255,128),floord(N-1,256)); #pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9) for (c2=lb1; c2<=ub1; c2++) { lb2=max(max(max(ceild(256*c1-256*c2-N+2,256),ceild(128*c1-256*c2-127,128)),ceild(128*c1-128*c2-32385,32768)),ceild(128*c1-128*c2-127,256)); ub2=min(floord(N-1,256),floord(256*c1-256*c2+255,256)); #pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9) for (c3=lb2; c3<=ub2; c3++) { for (c4=max(max(8*c1-8*c2-8*c3,0),8*c1-8*c2-1800*c3-1778);c4<=min(min(min(min(floord(3968*c3+3937,16),8*c1-8*c2-8*c3+7),floord(128*c2+127,16)),floord(N-2,32)),floord(128*c3+127,16));c4++) { for (c5=max(max(ceild(16*c4-15,16),0),8*c2);c5<=min(floord(N-1,32),8*c2+7);c5++) { for (c6=max(max(max(max(ceild(16*c4-465,496),ceild(8*c1-8*c2-16*c3-c4-217,223)),ceild(-8*c1+8*c2+16*c3+c4-217,225)),8*c3),ceild(16*c4-15,16));c6<=min(8*c3+7,floord(N-1,32));c6++) { if ((c1 == c2+2*c3) && (c4 == c6)) { for (c7=max(0,32*c6);c7<=min(min(32*c5+30,32*c6+30),N-2);c7++) { for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) { if ((c1-c2)%2 == 0) { S1((c1-c2)/2,c2,c4,c5,c7,c8) ; } for (c9=c7+1;c9<=min(32*c6+31,N-1);c9++) { if ((c1-c2)%2 == 0) { if ((c1-c2)%2 == 0) { S2((c1-c2)/2,(c1-c2)/2,c2,c4,c4,c5,c7,c9,c8) ; } } } } } } for 
(c7=max(32*c4,0);c7<=min(min(32*c6-1,32*c5+30),32*c4+31);c7++) { /*@ begin Loop( transform UnrollJam(ufactor=8) for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) transform Unroll(ufactor=8) for (c9=32*c6;c9<=min(N-1,32*c6+31);c9++) { S2(c1-c2-c3,c3,c2,c4,c6,c5,c7,c9,c8) ; } ) @*/{ for (c8 = max(c7 + 1, 32 * c5); c8 <= min(32 * c5 + 31, N - 1) - 7; c8 = c8 + 8) { for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8) { S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 3)); S2(c1 - c2 - c3, c3, 
c2, c4, c6, c5, c7, (c9 + 2), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 
7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 7)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 7)); } for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1) { S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6)); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7)); } } for (; c8 <= min(32 * c5 + 31, N - 1); c8 = c8 + 1) { for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8) { S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8); S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8); } for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1) S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); } } /*@ end @*/ } if ((c1 == c2+2*c3) && (-c4 == -c6) && (c4 <= min(floord(N-33,32),floord(32*c5-1,32)))) { for (c8=max(32*c5,32*c4+32);c8<=min(N-1,32*c5+31);c8++) { if ((c1-c2)%2 == 0) { S1((c1-c2)/2,c2,c4,c5,32*c4+31,c8) ; } } } } } } } } } /* End of CLooG code */ annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; printf("%f\n", annot_t_total); return ((int) A[0][0]); }
/*
 * Timing driver around a tiled, unrolled loop nest over the global array A
 * (A, N, REPS, init_arrays() and rtclock() are declared elsewhere).
 *
 * The nest is executed REPS times; every run is bracketed by rtclock() calls
 * and the mean run time ends up in annot_t_total.  The statements executed
 * inside the nest are
 *     A[c7][c9] = A[c7][c9] / A[c7][c7]
 *     A[c8][c9] = A[c8][c9] - A[c8][c7] * A[c7][c9]
 * which looks like an in-place LU factorisation -- TODO confirm against the
 * untiled source.  The c7 and c8 loops are unrolled by 4, each followed by a
 * remainder loop that handles the leftover iterations one at a time.
 *
 * Without TEST the mean time is printed; with TEST the N x N contents of A
 * are printed instead (a newline is emitted whenever j % 100 == 0).
 * Returns (int) A[0][0].
 *
 * NOTE(review): init_arrays() is called once, before the REPS loop, so every
 * repetition after the first starts from the A left behind by the previous
 * one.
 */
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); #include <math.h> #include <assert.h> #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) int c1, c2, c3, c4, c5, c6, c7, c8, c9; register int lbv, ubv; if (N >= 2) { for (c1=0;c1<=floord(N-2,256);c1++) { for (c2=max(ceild(128*c1-127,128),0);c2<=floord(N-1,256);c2++) { for (c3=max(ceild(128*c1-32385,32640),ceild(128*c1-127,128));c3<=floord(N-1,256);c3++) { for (c4=max(max(8*c1-1792*c3-1778,0),8*c1);c4<=min(min(min(min(floord(N-2,32),floord(128*c2+127,16)),8*c1+7),floord(128*c3+127,16)),floord(3968*c3+3937,16));c4++) { for (c5=max(max(0,ceild(16*c4-15,16)),8*c2);c5<=min(floord(N-1,32),8*c2+7);c5++) { for (c6=max(max(max(max(ceild(16*c4-465,496),ceild(-8*c1+8*c3+c4-217,225)),ceild(8*c1-8*c3-c4-217,223)),8*c3),ceild(16*c4-15,16));c6<=min(8*c3+7,floord(N-1,32));c6++) { if ((c1 == c3) && (c4 == c6)) { for (c7=max(0,32*c6);c7<=min(min(32*c5+30,N-2),32*c6+30);c7++) { { lbv=max(c7+1,32*c5); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c7][c9]=A[c7][c9]/A[c7][c7];} ; } } for (c8=c7+1;c8<=min(N-1,32*c6+31);c8++) { { lbv=max(c7+1,32*c5); ubv=min(32*c5+31,N-1); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];} ; } } } } } { for (c7 = max(0, 32 * c4); c7 <= min(min(32 * c6 - 1, 32 * c5 + 30), 32 * c4 + 31) - 3; c7 = c7 + 4) { for (c8 = 32 * c6; c8 <= min(N - 1, 32 * c6 + 31) - 3; c8 = c8 + 4) { { lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];}; {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][c7]*A[c7][c9];}; {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][c7]*A[c7][c9];}; {A[(c8 
+ 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][c7]*A[c7][c9];}; } } { lbv=max(32*c5,(c7+1)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 1)]*A[(c7 + 1)][c9];}; {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 1)]*A[(c7 + 1)][c9];}; {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 1)]*A[(c7 + 1)][c9];}; {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 1)]*A[(c7 + 1)][c9];}; } } { lbv=max(32*c5,(c7+2)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 2)]*A[(c7 + 2)][c9];}; {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 2)]*A[(c7 + 2)][c9];}; {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 2)]*A[(c7 + 2)][c9];}; {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 2)]*A[(c7 + 2)][c9];}; } } { lbv=max(32*c5,(c7+3)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 3)]*A[(c7 + 3)][c9];}; {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 3)]*A[(c7 + 3)][c9];}; {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 3)]*A[(c7 + 3)][c9];}; {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 3)]*A[(c7 + 3)][c9];}; } } } for (; c8 <= min(N - 1, 32 * c6 + 31); c8 = c8 + 1) { { lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];}; } } { lbv=max(32*c5,(c7+1)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 1)]*A[(c7 + 1)][c9];}; } } { lbv=max(32*c5,(c7+2)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 2)]*A[(c7 + 2)][c9];}; } } { lbv=max(32*c5,(c7+3)+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 3)]*A[(c7 + 3)][c9];}; } } } 
} for (; c7 <= min(min(32 * c6 - 1, 32 * c5 + 30), 32 * c4 + 31); c7 = c7 + 1) { for (c8 = 32 * c6; c8 <= min(N - 1, 32 * c6 + 31) - 3; c8 = c8 + 4) { lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];}; {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][c7]*A[c7][c9];}; {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][c7]*A[c7][c9];}; {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][c7]*A[c7][c9];}; } } for (; c8 <= min(N - 1, 32 * c6 + 31); c8 = c8 + 1) { lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];}; } } } } if ((c1 == c3) && (-c4 == -c6) && (c4 <= min(floord(N-33,32),floord(32*c5-1,32)))) { { lbv=max(32*c4+32,32*c5); ubv=min(N-1,32*c5+31); #pragma ivdep #pragma vector always for (c9=lbv; c9<=ubv; c9++) { {A[32*c4+31][c9]=A[32*c4+31][c9]/A[32*c4+31][32*c4+31];} ; } } } } } } } } } } annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; #ifndef TEST printf("%f\n", annot_t_total); #else { int i, j; for (i=0; i<N; i++) { for (j=0; j<N; j++) { if (j%100==0) printf("\n"); printf("%f ",A[i][j]); } printf("\n"); } } #endif return ((int) A[0][0]); }
/* Allocate room for `count` unsigned ints.  At least one element is always
 * requested, so a NULL result can only mean that the allocation failed
 * (malloc(0) is allowed to return NULL). */
static unsigned int *inspectBlock_allocUInts(size_t count)
{
    return (unsigned int *)malloc(sizeof(unsigned int) * (count ? count : 1));
}

/*
 * Partition the sparse matrix `m` into tiles of bsx rows by bsy columns and
 * describe every tile that contains at least one nonzero.
 *
 * Parameters
 *   m               matrix to inspect; rowPtrs[i] is used as the index of the
 *                   first entry of row i in nzentries[].  The scan never
 *                   looks back, so the column indices of a row are assumed to
 *                   be stored in increasing order -- TODO confirm with the
 *                   code that builds SpMatrix.
 *   rowIndices      out: ceil(nrows/bsx)+1 entries; entry r is the index of
 *                   the first non-empty tile of row block r, the last entry
 *                   is the total tile count.
 *   indices         out: for each non-empty tile, its column-block index.
 *   numblocks       out: number of non-empty tiles.
 *   nnzCount_block  out: for each non-empty tile, its number of nonzeros.
 *   yCount_block    out: for each non-empty tile, the number of distinct
 *                   columns inside the tile that hold a nonzero.
 *   bsx, bsy        tile height and width; both must be nonzero.
 *
 * Returns 0 on success and ERR_INSUFFICIENT_MEMORY when an allocation fails.
 * All scratch buffers are released on every path.  Output arrays that were
 * already allocated when a failure occurs are left in place, as before, so
 * callers that release them keep working.
 */
int inspectBlock(SpMatrix *m, unsigned int **rowIndices, unsigned int **indices,
                 unsigned int *numblocks, unsigned int **nnzCount_block,
                 unsigned int **yCount_block, unsigned int bsx, unsigned int bsy)
{
    const unsigned int nrows = m->numRows;
    const unsigned int ncols = m->numCols;
    const unsigned int nRowBlocks = (unsigned int)ceild(nrows, bsx);
    const unsigned int nColBlocks = (unsigned int)ceild(ncols, bsy);
    unsigned int nblocks = 0;
    unsigned int it, iti;
    int status = ERR_INSUFFICIENT_MEMORY;

    /* Scratch buffers.  rowEnd[] holds the exclusive end of each row so that
     * empty rows need no "end - 1" arithmetic, which wraps around for
     * unsigned values when a row ends at index 0. */
    unsigned int *yflag = inspectBlock_allocUInts(bsy);
    unsigned int *yBFlag = inspectBlock_allocUInts(nColBlocks);
    unsigned int *rowEnd = inspectBlock_allocUInts(bsx);
    unsigned int *bptr = inspectBlock_allocUInts(bsx);
    unsigned int *colFlag = inspectBlock_allocUInts(nColBlocks);
    unsigned int *nblocksRow = inspectBlock_allocUInts(nRowBlocks);
    /* calloc: every slot starts out NULL so cleanup may free all of them. */
    unsigned int **indRows =
        (unsigned int **)calloc(nRowBlocks ? nRowBlocks : 1, sizeof(unsigned int *));

    *rowIndices = inspectBlock_allocUInts((size_t)nRowBlocks + 1);
    if (*rowIndices == NULL || yflag == NULL || yBFlag == NULL || rowEnd == NULL ||
        bptr == NULL || colFlag == NULL || nblocksRow == NULL || indRows == NULL)
        goto cleanup;

    /* Pass 1: one row block at a time, count nonzeros per tile. */
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        const unsigned int rowLimit = min(it + bsx, nrows);

        (*rowIndices)[iti] = nblocks;
        nblocksRow[iti] = 0;

        for (unsigned int i = it; i < rowLimit; i++) {
            bptr[i - it] = (m->rowPtrs)[i];
            if (i == (nrows - 1))
                rowEnd[i - it] = m->numNZEntries;
            else
                rowEnd[i - it] = (m->rowPtrs)[i + 1];
        }

        /* Walk the column blocks left to right; bptr[] remembers how far
         * each row has been consumed. */
        for (unsigned int jt = 0, jti = 0; jt < ncols; jt += bsy, jti++) {
            const unsigned int blockStart = nblocks;

            colFlag[jti] = 0;
            yBFlag[jti] = 0;
            for (unsigned int k = 0; k < bsy; k++)
                yflag[k] = 0;

            for (unsigned int i = it; i < rowLimit; i++) {
                unsigned int j = bptr[i - it];

                for (; j < rowEnd[i - it]; j++) {
                    const unsigned int cInd = (m->nzentries)[j].colNum;

                    if (cInd >= jt + bsy)
                        break;
                    yflag[cInd - jt] = 1;      /* column is touched by this tile */
                    colFlag[jti]++;            /* #nonzeros in this tile */
                    if (blockStart == nblocks) {
                        /* first nonzero seen in this tile */
                        nblocks++;
                        nblocksRow[iti]++;
                    }
                }
                bptr[i - it] = j;
            }
            for (unsigned int k = 0; k < bsy; k++)
                yBFlag[jti] += yflag[k];
        }

        /* Keep (column block, #nonzeros, #columns) for the non-empty tiles. */
        indRows[iti] = inspectBlock_allocUInts((size_t)3 * nblocksRow[iti]);
        if (indRows[iti] == NULL)
            goto cleanup;
        for (unsigned int k = 0, rec = 0; k < nColBlocks; k++) {
            if (colFlag[k]) {
                indRows[iti][3 * rec] = k;
                indRows[iti][3 * rec + 1] = colFlag[k];
                indRows[iti][3 * rec + 2] = yBFlag[k];
                rec++;      /* ends up equal to nblocksRow[iti] */
            }
        }
    }
    (*rowIndices)[iti] = nblocks;
    *numblocks = nblocks;

    *indices = inspectBlock_allocUInts(nblocks);
    if (*indices == NULL)
        goto cleanup;
    *nnzCount_block = inspectBlock_allocUInts(nblocks);
    if (*nnzCount_block == NULL)
        goto cleanup;
    *yCount_block = inspectBlock_allocUInts(nblocks);
    if (*yCount_block == NULL)
        goto cleanup;

    /* Pass 2: concatenate the per-row-block records into the output arrays. */
    nblocks = 0;
    for (unsigned int k = 0; k < nRowBlocks; k++) {
        for (unsigned int l = 0; l < nblocksRow[k]; l++) {
            (*indices)[nblocks] = indRows[k][3 * l];
            (*nnzCount_block)[nblocks] = indRows[k][3 * l + 1];
            (*yCount_block)[nblocks] = indRows[k][3 * l + 2];
            nblocks++;
        }
    }
    status = 0;

cleanup:
    if (indRows != NULL) {
        for (unsigned int k = 0; k < nRowBlocks; k++)
            free(indRows[k]);
    }
    free(indRows);
    free(nblocksRow);
    free(rowEnd);
    free(bptr);
    free(colFlag);
    free(yflag);
    free(yBFlag);
    return status;
}
/* Allocate room for `count` unsigned ints.  At least one element is always
 * requested, so a NULL result can only mean that the allocation failed. */
static unsigned int *inspectInputBlock_allocUInts(size_t count)
{
    return (unsigned int *)malloc(sizeof(unsigned int) * (count ? count : 1));
}

/* Record, for rows [rowStart, rowLimit), where each row starts (cursor[]) and
 * where it ends (rowEnd[], exclusive); returns the length of the longest row. */
static unsigned int inspectInputBlock_loadRows(SpMatrix *m, unsigned int rowStart,
                                               unsigned int rowLimit, unsigned int nrows,
                                               unsigned int *cursor, unsigned int *rowEnd)
{
    unsigned int longest = 0;

    for (unsigned int i = rowStart; i < rowLimit; i++) {
        const unsigned int r = i - rowStart;

        cursor[r] = (m->rowPtrs)[i];
        if (i == (nrows - 1))
            rowEnd[r] = m->numNZEntries;
        else
            rowEnd[r] = (m->rowPtrs)[i + 1];
        longest = max(longest, rowEnd[r] - cursor[r]);
    }
    return longest;
}

/* Consume up to bsy further nonzeros from every row in [rowStart, rowLimit)
 * and collect their distinct column indices in out[]; returns how many were
 * collected.  cursor[] is advanced past the consumed entries. */
static unsigned int inspectInputBlock_gather(SpMatrix *m, unsigned int rowStart,
                                             unsigned int rowLimit, unsigned int bsy,
                                             unsigned int *cursor, const unsigned int *rowEnd,
                                             unsigned int *out)
{
    unsigned int count = 0;

    for (unsigned int i = rowStart; i < rowLimit; i++) {
        const unsigned int r = i - rowStart;
        unsigned int j = cursor[r];
        const unsigned int stop = min(j + bsy, rowEnd[r]);

        for (; j < stop; j++) {
            const unsigned int cInd = (m->nzentries)[j].colNum;

            /* Entries of the first row are stored without a duplicate check,
             * exactly as before. */
            if (i == rowStart || !isPresent(out, count, cInd))
                out[count++] = cInd;
        }
        cursor[r] = j;
    }
    return count;
}

/*
 * Build the per-block input lists of the sparse matrix `m`.
 *
 * Rows are grouped into row blocks of bsx rows.  Inside a row block, block b
 * is made of the b-th chunk of (up to) bsy stored nonzeros of every row, so a
 * row block whose longest row has L nonzeros produces ceil(L / bsy) blocks.
 *
 * Parameters
 *   m               matrix to inspect (rowPtrs / nzentries / numNZEntries).
 *   inputList       out: *inputListCount entries.  For a block with more than
 *                   bsx*bsy/2 distinct columns whose sorted column span is
 *                   below 512, the sorted distinct column indices; for every
 *                   other block the same number of entries, all set to the
 *                   column count ncols.
 *   rowIndices      out: ceil(nrows/bsx)+1 entries; entry r is the index of
 *                   the first block of row block r, the last entry is the
 *                   total block count.
 *   indices         out: *numblocks + 1 offsets into inputList; block b owns
 *                   inputList[indices[b] .. indices[b+1]).
 *   numblocks       out: total number of blocks.
 *   inputListCount  out: length of inputList.
 *   bsx, bsy        rows per row block / nonzeros taken per row and block;
 *                   both must be nonzero.
 *
 * Returns 0 on success and ERR_INSUFFICIENT_MEMORY when an allocation fails
 * (the allocations used to be unchecked).  Scratch buffers are released on
 * every path; output arrays already allocated at the time of a failure are
 * left for the caller.
 *
 * The block loop used to advance by bsy while being compared against the
 * block count, so for bsy > 1 it visited fewer blocks than rowIndices and
 * numblocks announce and left part of indices unwritten.  It now visits
 * exactly the announced number of blocks.
 */
int inspectInputBlock(SpMatrix *m, unsigned int **inputList, unsigned int **rowIndices,
                      unsigned int **indices, unsigned int *numblocks,
                      unsigned int *inputListCount, unsigned int bsx, unsigned int bsy)
{
    const unsigned int nrows = m->numRows;
    const unsigned int ncols = m->numCols;
    const unsigned int nRowBlocks = (unsigned int)ceild(nrows, bsx);
    unsigned int nblocks = 0;
    unsigned int countInputList = 0;
    unsigned int it, iti;
    int status = ERR_INSUFFICIENT_MEMORY;

    unsigned int *rowEnd = inspectInputBlock_allocUInts(bsx);
    unsigned int *bptr = inspectInputBlock_allocUInts(bsx);
    unsigned int *inpListRowBlock = inspectInputBlock_allocUInts((size_t)bsx * bsy);

    *rowIndices = inspectInputBlock_allocUInts((size_t)nRowBlocks + 1);
    if (*rowIndices == NULL || rowEnd == NULL || bptr == NULL || inpListRowBlock == NULL)
        goto cleanup;

    /* Pass 1: count the blocks of every row block. */
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        const unsigned int longest =
            inspectInputBlock_loadRows(m, it, min(it + bsx, nrows), nrows, bptr, rowEnd);

        (*rowIndices)[iti] = nblocks;
        nblocks += (unsigned int)ceild(longest, bsy);
    }
    (*rowIndices)[iti] = nblocks;
    *numblocks = nblocks;

    *indices = inspectInputBlock_allocUInts((size_t)nblocks + 1);
    if (*indices == NULL)
        goto cleanup;

    /* Pass 2: size every block's input list to obtain the offsets. */
    nblocks = 0;
    for (it = 0; it < nrows; it += bsx) {
        const unsigned int rowLimit = min(it + bsx, nrows);
        const unsigned int longest =
            inspectInputBlock_loadRows(m, it, rowLimit, nrows, bptr, rowEnd);
        const unsigned int nblocksPerRowBlock = (unsigned int)ceild(longest, bsy);

        for (unsigned int blk = 0; blk < nblocksPerRowBlock; blk++, nblocks++) {
            (*indices)[nblocks] = countInputList;
            countInputList +=
                inspectInputBlock_gather(m, it, rowLimit, bsy, bptr, rowEnd, inpListRowBlock);
        }
    }
    (*indices)[nblocks] = countInputList;

    *inputList = inspectInputBlock_allocUInts(countInputList);
    *inputListCount = countInputList;
    if (*inputList == NULL)
        goto cleanup;

    /* Pass 3: gather again and write the lists. */
    countInputList = 0;
    for (it = 0; it < nrows; it += bsx) {
        const unsigned int rowLimit = min(it + bsx, nrows);
        const unsigned int longest =
            inspectInputBlock_loadRows(m, it, rowLimit, nrows, bptr, rowEnd);
        const unsigned int nblocksPerRowBlock = (unsigned int)ceild(longest, bsy);

        for (unsigned int blk = 0; blk < nblocksPerRowBlock; blk++) {
            const unsigned int countPerBlock =
                inspectInputBlock_gather(m, it, rowLimit, bsy, bptr, rowEnd, inpListRowBlock);
            unsigned int *dst = *inputList + countInputList;

            sort(inpListRowBlock, countPerBlock);
            if ((countPerBlock > (bsx * bsy / 2)) &&
                (inpListRowBlock[countPerBlock - 1] - inpListRowBlock[0] < 512)) {
                /* many distinct columns within a narrow span: keep the list */
                for (unsigned int k = 0; k < countPerBlock; k++)
                    dst[k] = inpListRowBlock[k];
            } else {
                /* otherwise every slot is filled with ncols */
                for (unsigned int k = 0; k < countPerBlock; k++)
                    dst[k] = ncols;
            }
            countInputList += countPerBlock;
        }
    }
    status = 0;

cleanup:
    free(inpListRowBlock);
    free(rowEnd);
    free(bptr);
    return status;
}
/* Allocate room for `count` unsigned ints.  At least one element is always
 * requested, so a NULL result can only mean that the allocation failed. */
static unsigned int *inspectVarBlock_allocUInts(size_t count)
{
    return (unsigned int *)malloc(sizeof(unsigned int) * (count ? count : 1));
}

/*
 * Partition the sparse matrix `m` into tiles of bsx rows by bsy columns whose
 * starting column is variable, and produce a padded copy of the nonzeros.
 *
 * Within a row block the tiles are laid out left to right.  Before a tile is
 * opened, empty column strips of width varC are skipped, so a tile starts at
 * the varC-aligned column (relative to the end of the previous tile) that
 * holds the leftmost unconsumed nonzero of the row block.  bsy is assumed to
 * be a multiple of varC, and varC must be nonzero.
 *
 * Parameters
 *   m               matrix to inspect (rowPtrs / nzentries / numNZEntries);
 *                   column indices of a row are assumed to be increasing --
 *                   TODO confirm with the code that builds SpMatrix.
 *   valFill         out: *nnz_fill values.  For every row, and for every
 *                   tile of its row block in turn, the row's nonzeros inside
 *                   the tile followed by zeros up to a multiple of HALFWARP
 *                   (nothing is emitted when the row has no nonzero there).
 *   indicesFill     out: column index for each valFill entry; padding
 *                   entries carry the tile's starting column.
 *   rowIndicesFill  out: nrows+1 offsets into valFill, one per row.
 *   rowIndices      out: ceil(nrows/bsx)+1 entries; entry r is the index of
 *                   the first tile of row block r, last entry = tile count.
 *   indices         out: starting column of every tile.
 *   numblocks       out: number of tiles.
 *   nnzCount_block  out: number of nonzeros of every tile.
 *   yCount_block    out: number of distinct nonzero columns of every tile.
 *   nnz_fill        out: number of entries written to valFill/indicesFill.
 *   bsx, bsy, varC  tile height, tile width, skip granularity.
 *
 * Returns 0 on success and ERR_INSUFFICIENT_MEMORY when an allocation fails.
 * Scratch buffers are released on every path; output arrays already
 * allocated at the time of a failure are left for the caller.
 *
 * Differences from the previous revision:
 *   - rows that are already fully consumed no longer take part in choosing a
 *     tile's starting column (their "next entry" belonged to another row, or
 *     lay past the end of nzentries), and the column scan stops once every
 *     row of the row block is consumed;
 *   - row and tile ranges use exclusive upper bounds, so empty rows and
 *     empty row blocks cannot wrap around to UINT_MAX;
 *   - the fill pass pads after every tile, exactly as the counting pass
 *     reserves space; it used to drop the padding of rows that ended before
 *     the last tile of their row block.
 */
int inspectVarBlock(SpMatrix *m, float **valFill, unsigned int **indicesFill,
                    unsigned int **rowIndicesFill, unsigned int **rowIndices,
                    unsigned int **indices, unsigned int *numblocks,
                    unsigned int **nnzCount_block, unsigned int **yCount_block,
                    unsigned int *nnz_fill, unsigned int bsx, unsigned int bsy,
                    unsigned int varC)
{
    const unsigned int nrows = m->numRows;
    const unsigned int ncols = m->numCols;
    const unsigned int nRowBlocks = (unsigned int)ceild(nrows, bsx);
    const unsigned int nColBlocks = (unsigned int)ceild(ncols, bsy);
    unsigned int nblocks = 0, nnz_filled = 0;
    unsigned int it, iti;
    int status = ERR_INSUFFICIENT_MEMORY;

    unsigned int *yflag = inspectVarBlock_allocUInts(bsy);
    unsigned int *yBFlag = inspectVarBlock_allocUInts(nColBlocks);
    unsigned int *rowEnd = inspectVarBlock_allocUInts(bsx);
    unsigned int *bptr = inspectVarBlock_allocUInts(bsx);
    unsigned int *colFlag = inspectVarBlock_allocUInts(nColBlocks);
    unsigned int *startCol = inspectVarBlock_allocUInts(nColBlocks);
    unsigned int *nblocksRow = inspectVarBlock_allocUInts(nRowBlocks);
    /* calloc: every slot starts out NULL so cleanup may free all of them. */
    unsigned int **indRows =
        (unsigned int **)calloc(nRowBlocks ? nRowBlocks : 1, sizeof(unsigned int *));

    *rowIndices = inspectVarBlock_allocUInts((size_t)nRowBlocks + 1);
    if (*rowIndices == NULL || yflag == NULL || yBFlag == NULL || rowEnd == NULL ||
        bptr == NULL || colFlag == NULL || startCol == NULL || nblocksRow == NULL ||
        indRows == NULL)
        goto cleanup;

    /* Pass 1: lay out the tiles and count nonzeros plus padding. */
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        const unsigned int rowLimit = min(it + bsx, nrows);

        (*rowIndices)[iti] = nblocks;
        nblocksRow[iti] = 0;

        for (unsigned int i = it; i < rowLimit; i++) {
            bptr[i - it] = (m->rowPtrs)[i];
            if (i == (nrows - 1))
                rowEnd[i - it] = m->numNZEntries;
            else
                rowEnd[i - it] = (m->rowPtrs)[i + 1];
        }

        for (unsigned int jt = 0, jti = 0; jt < ncols; jt += bsy, jti++) {
            unsigned int minjt = jt;
            unsigned int blockStart;
            int haveRow = 0;

            /* Skip empty strips of width varC: find the leftmost strip that
             * still holds an unconsumed nonzero of some row. */
            for (unsigned int i = it; i < rowLimit; i++) {
                unsigned int cInd, jtv;

                if (bptr[i - it] >= rowEnd[i - it])
                    continue;               /* row fully consumed */
                cInd = (m->nzentries)[bptr[i - it]].colNum;
                jtv = (cInd > jt) ? jt + ((cInd - jt) / varC) * varC : jt;
                if (!haveRow || jtv < minjt)
                    minjt = jtv;
                haveRow = 1;
            }
            if (!haveRow)
                break;                      /* nothing left in this row block */

            jt = minjt;
            startCol[jti] = jt;
            colFlag[jti] = 0;
            yBFlag[jti] = 0;
            for (unsigned int k = 0; k < bsy; k++)
                yflag[k] = 0;

            blockStart = nblocks;
            for (unsigned int i = it; i < rowLimit; i++) {
                unsigned int j = bptr[i - it];
                unsigned int rownnz = 0;

                for (; j < rowEnd[i - it]; j++) {
                    const unsigned int cInd = (m->nzentries)[j].colNum;

                    if (cInd >= jt + bsy)
                        break;
                    yflag[cInd - jt] = 1;   /* column is touched by this tile */
                    colFlag[jti]++;         /* #nonzeros in this tile */
                    nnz_filled++;
                    rownnz++;
                    if (blockStart == nblocks) {
                        nblocks++;
                        nblocksRow[iti]++;
                    }
                }
                bptr[i - it] = j;
                /* reserve padding up to the next multiple of HALFWARP */
                if (rownnz % HALFWARP)
                    nnz_filled += (HALFWARP - (rownnz % HALFWARP));
            }
            for (unsigned int k = 0; k < bsy; k++)
                yBFlag[jti] += yflag[k];
        }

        /* Every tile opened above is non-empty, so the first
         * nblocksRow[iti] scratch slots are exactly this row block's tiles. */
        indRows[iti] = inspectVarBlock_allocUInts((size_t)3 * nblocksRow[iti]);
        if (indRows[iti] == NULL)
            goto cleanup;
        for (unsigned int rec = 0; rec < nblocksRow[iti]; rec++) {
            indRows[iti][3 * rec] = startCol[rec];
            indRows[iti][3 * rec + 1] = colFlag[rec];
            indRows[iti][3 * rec + 2] = yBFlag[rec];
        }
    }
    (*rowIndices)[iti] = nblocks;
    *numblocks = nblocks;

    *indices = inspectVarBlock_allocUInts(nblocks);
    if (*indices == NULL)
        goto cleanup;
    *nnzCount_block = inspectVarBlock_allocUInts(nblocks);
    if (*nnzCount_block == NULL)
        goto cleanup;
    *yCount_block = inspectVarBlock_allocUInts(nblocks);
    if (*yCount_block == NULL)
        goto cleanup;

    /* Concatenate the per-row-block records into the output arrays. */
    nblocks = 0;
    for (unsigned int k = 0; k < nRowBlocks; k++) {
        for (unsigned int l = 0; l < nblocksRow[k]; l++) {
            (*indices)[nblocks] = indRows[k][3 * l];
            (*nnzCount_block)[nblocks] = indRows[k][3 * l + 1];
            (*yCount_block)[nblocks] = indRows[k][3 * l + 2];
            nblocks++;
        }
    }

    /* Pass 2: emit the padded values, mirroring the counting pass. */
    *valFill = (float *)malloc(sizeof(float) * (nnz_filled ? nnz_filled : 1));
    if (*valFill == NULL)
        goto cleanup;
    *indicesFill = inspectVarBlock_allocUInts(nnz_filled);
    if (*indicesFill == NULL)
        goto cleanup;
    *rowIndicesFill = inspectVarBlock_allocUInts((size_t)nrows + 1);
    if (*rowIndicesFill == NULL)
        goto cleanup;

    nnz_filled = 0;
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        const unsigned int rowLimit = min(it + bsx, nrows);
        const unsigned int firstBlk = (*rowIndices)[iti];
        const unsigned int endBlk = (*rowIndices)[iti + 1];   /* exclusive */

        for (unsigned int i = it; i < rowLimit; i++) {
            unsigned int j = (m->rowPtrs)[i];
            unsigned int end;

            if (i == (nrows - 1))
                end = m->numNZEntries;
            else
                end = (m->rowPtrs)[i + 1];

            (*rowIndicesFill)[i] = nnz_filled;
            for (unsigned int jb = firstBlk; jb < endBlk; jb++) {
                const unsigned int blkCol = (*indices)[jb];
                unsigned int rownnz = 0;

                for (; j < end; j++) {
                    const unsigned int cInd = (m->nzentries)[j].colNum;

                    if (cInd >= (blkCol + bsy))
                        break;
                    (*valFill)[nnz_filled] = (m->nzentries)[j].val;
                    (*indicesFill)[nnz_filled] = cInd;
                    nnz_filled++;
                    rownnz++;
                }
                if (rownnz % HALFWARP) {
                    const unsigned int pad = HALFWARP - (rownnz % HALFWARP);

                    for (unsigned int p = 0; p < pad; p++) {
                        (*valFill)[nnz_filled] = 0;
                        (*indicesFill)[nnz_filled] = blkCol;
                        nnz_filled++;
                    }
                }
            }
        }
    }
    (*rowIndicesFill)[nrows] = nnz_filled;
    *nnz_fill = nnz_filled;
    status = 0;

cleanup:
    if (indRows != NULL) {
        for (unsigned int k = 0; k < nRowBlocks; k++)
            free(indRows[k]);
    }
    free(indRows);
    free(nblocksRow);
    free(rowEnd);
    free(bptr);
    free(colFlag);
    free(startCol);
    free(yflag);
    free(yBFlag);
    return status;
}
/*
 * Runs a CLooG-generated, tiled loop nest (see the "Generated from
 * jacobi-imper.sched.cloog" note in the body) and returns how long it took.
 *
 * args.width is read into N and args.iters into T; jbi_in and jbi_out are
 * aliased as a and b.  The loop bodies only invoke the S1(...) and S2(...)
 * macros, which are defined elsewhere -- presumably they read and write a
 * and b; verify against their definitions.
 *
 * c1 is iterated sequentially.  For each c1 the bounds lb/ub of c2 are
 * computed first and the c2 loop is then annotated with
 * "#pragma omp parallel for" (c2..c5 and i, j, k, l private).  The
 * (c1-c2)%2 and c3 parity tests guard the divisions by 2 that feed the
 * S1/S2 arguments.  The assignments to i, j, k, l repeat those argument
 * expressions; nothing in this function reads them back.
 *
 * Timing uses clock_gettime(CLOCK_MONOTONIC, ...) into tbegin and tend, which
 * are declared elsewhere; the return value is ELAPSED_TIME_S(tend, tbegin).
 */
double djbi1d_from_pluto(struct args_dimt args, double *jbi_in, double *jbi_out) { int N,T; int c1, c2, c3, c4, c5; int i, j, k, l, t; (void)i; (void)j; (void)k; (void)l; (void)t; register int lb, ub; N = args.width; T = args.iters; double *a, *b; a = jbi_in; b = jbi_out; clock_gettime(CLOCK_MONOTONIC, &tbegin); /* Generated from jacobi-imper.sched.cloog by CLooG v0.14.1 64 bits in 0.01s. */ for (c1=-1;c1<=floord(N+3*T-4,2048);c1++) { lb = max(max(ceild(2048*c1-T+1,2048),ceild(4096*c1-2045,6144)),0); ub = min(min(floord(2048*c1+2047,2048),floord(4096*c1+N+4093,6144)),floord(N+2*T-3,2048)); #pragma omp parallel for shared(c1,lb,ub,a,b) private(c2,c3,c4,c5,i,j,k,l) for (c2=lb;c2<=ub;c2++) { if (c1 >= max(c2,ceild(6144*c2-N+2,4096))) { c3 = 4096*c1-4096*c2 ; for (c4=max(4096*c1-4096*c2+2,2048*c2);c4<=min(4096*c1-4096*c2+N-2,2048*c2+2047);c4++) { c5 = 0 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; j = -c1+2*c2 ; k = 2048*c1-2048*c2 ; l = -4096*c1+4096*c2+c4 ; S1((c1-c2)/2,-c1+2*c2,2048*c1-2048*c2,-4096*c1+4096*c2+c4) ; } } } if ((c1 <= floord(4096*c2-1,4096)) && (c2 <= floord(N-2,2048))) { c3 = 0 ; for (c4=max(2048*c2,2);c4<=min(2048*c2+2047,N-2);c4++) { c5 = 0 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; j = -c1+2*c2 ; k = 0 ; l = c4 ; S1((c1-c2)/2,-c1+2*c2,0,c4) ; } } } for (c3=max(max(1,2048*c2-N+2),4096*c1-4096*c2+1);c3<=min(min(4096*c1-4096*c2+4094,2048*c2+2045),2*T-2);c3++) { for (c4=max(2048*c2,c3+2);c4<=min(c3+N-2,2048*c2+2047);c4++) { c5 = 0 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; j = -c1+2*c2 ; if (c3%2 == 0) { k = c3/2 ; l = -c3+c4 ; S1((c1-c2)/2,-c1+2*c2,c3/2,-c3+c4) ; } } c5 = 1 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; j = -c1+2*c2 ; if ((c3-1)%2 == 0) { k = (c3-1)/2 ; l = -c3+c4 ; S2((c1-c2)/2,-c1+2*c2,(c3-1)/2,-c3+c4) ; } } } } if (c1 <= min(floord(3072*c2-1025,2048),floord(2048*c2+T-2048,2048))) { c3 = 4096*c1-4096*c2+4095 ; for (c4=max(4096*c1-4096*c2+4097,2048*c2);c4<=min(4096*c1-4096*c2+N+4093,2048*c2+2047);c4++) { c5 = 1 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; 
j = -c1+2*c2 ; k = 2048*c1-2048*c2+2047 ; l = -4096*c1+4096*c2+c4-4095 ; S2((c1-c2)/2,-c1+2*c2,2048*c1-2048*c2+2047,-4096*c1+4096*c2+c4-4095) ; } } } if ((c1 >= ceild(4096*c2+2*T-4095,4096)) && (c2 >= ceild(T-1023,1024))) { c3 = 2*T-1 ; for (c4=max(2*T+1,2048*c2);c4<=min(2048*c2+2047,N+2*T-3);c4++) { c5 = 1 ; if ((c1-c2)%2 == 0) { i = (c1-c2)/2 ; j = -c1+2*c2 ; k = T-1 ; l = c4-2*T+1 ; S2((c1-c2)/2,-c1+2*c2,T-1,c4-2*T+1) ; } } } } } clock_gettime(CLOCK_MONOTONIC, &tend); return ELAPSED_TIME_S(tend, tbegin); }
/*
 * Free-standing loop nest: a guard on the parameters M, N,
 * outerTimeTileScatter, outerProcTileScatter1 and outerProcTileScatter2,
 * followed by three nested loops over compScatter1, compScatter2 and
 * compScatter3 that call S1(...) once per point.  Neither the parameters nor
 * the iterators are declared in this fragment, so it relies on an enclosing
 * scope -- TODO confirm where it is meant to be included.
 */
/* Generated from ../../../git/cloog/test/classen2.cloog by CLooG 0.14.0-271-gaa1e292 gmp bits in 0.14s. */ if ((M >= 2) && (N >= 3) && (outerProcTileScatter1 >= outerProcTileScatter2) && (5*outerProcTileScatter1 <= M+2*N-4) && (5*outerProcTileScatter1 <= 5*outerProcTileScatter2+N+2) && (outerProcTileScatter2 >= 0) && (5*outerProcTileScatter2 <= M+N-2) && (outerTimeTileScatter >= outerProcTileScatter1) && (outerTimeTileScatter <= 2*outerProcTileScatter1) && (outerTimeTileScatter <= outerProcTileScatter1+outerProcTileScatter2+1) && (5*outerTimeTileScatter <= 2*M+2*N-6) && (5*outerTimeTileScatter <= 5*outerProcTileScatter1+M+2) && (5*outerTimeTileScatter >= 10*outerProcTileScatter1-2*N-2) && (5*outerTimeTileScatter <= 5*outerProcTileScatter2+M+N) && (5*outerTimeTileScatter >= 10*outerProcTileScatter2-N-3) && (5*outerTimeTileScatter <= 10*outerProcTileScatter2+N+3) && (5*outerTimeTileScatter >= 5*outerProcTileScatter1+5*outerProcTileScatter2-N-4)) { for (compScatter1=max(max(max(max(max(4,5*outerTimeTileScatter),5*outerProcTileScatter2+1),5*outerProcTileScatter1+5*outerProcTileScatter2-N),10*outerProcTileScatter1-2*N+2),10*outerProcTileScatter2-N+1);compScatter1<=min(min(min(min(min(5*outerTimeTileScatter+4,2*M+2*N-6),5*outerProcTileScatter1+M+2),5*outerProcTileScatter1+5*outerProcTileScatter2+5),5*outerProcTileScatter2+M+N),10*outerProcTileScatter2+N+3);compScatter1++) { for (compScatter2=max(max(max(max(ceild(compScatter1+4,2),5*outerProcTileScatter1),5*outerProcTileScatter2+1),compScatter1-M+2),compScatter1-5*outerProcTileScatter2-1);compScatter2<=min(min(min(min(floord(compScatter1+2*N-2,2),compScatter1),5*outerProcTileScatter1+4),compScatter1-5*outerProcTileScatter2+N),5*outerProcTileScatter2+N+2);compScatter2++) { for (compScatter3=max(max(5*outerProcTileScatter2,compScatter1-compScatter2+3),compScatter2-N+2);compScatter3<=min(min(compScatter2-1,5*outerProcTileScatter2+4),compScatter1-compScatter2+N);compScatter3++) { 
S1(compScatter1-compScatter2+1,-compScatter1+compScatter2+compScatter3-2,compScatter2-compScatter3,compScatter1,compScatter2,compScatter3); } } } }
/*
 * Scanning code for a problem of size n: walks the scattering iterators
 * t1, t2 and t3 and invokes the statement macros S1(...) and S2(...)
 * (defined elsewhere) at the selected points.
 *
 * The body is a sequence of separate cases -- guards on n (n >= 1, n == 2,
 * n >= 3) and consecutive ranges of t1 -- each with its own t2/t3 loops.
 * Parity tests such as (t1+t2)%2 == 0 guard the divisions by 2 that feed the
 * S1/S2 arguments.  The original iterators i, j, k are assigned next to
 * the calls, but the calls receive the same expressions directly and
 * nothing here reads i, j or k back.
 */
void test(int n) { /* Scattering iterators. */ int t1, t2, t3; /* Original iterators. */ int i, j, k; if (n >= 1) { t1 = -n+1 ; t2 = n+1 ; for (t3=n+3;t3<=3*n+1;t3++) { if ((t3+n+1)%2 == 0) { k = (t3-n-1)/2 ; S1(1,n,(t3-n-1)/2) ; } } } if ((n >= 2) && (n <= 2)) { t1 = -n+2 ; for (t2=-n+4;t2<=3*n-2;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t2+n)%2 == 0) { i = (t2-n+2)/2 ; j = (t2+n-2)/2 ; if ((t3+n)%2 == 0) { k = (-t2+t3)/2 ; S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ; } } } } t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(1,n,t3) ; } } if (n >= 3) { t1 = -n+2 ; for (t2=n;t2<=n+2;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t2+n)%2 == 0) { i = (t2-n+2)/2 ; j = (t2+n-2)/2 ; if ((t3+n)%2 == 0) { k = (-t2+t3)/2 ; S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ; } } } } t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(1,n,t3) ; } } for (t1=ceild(-2*n+5,2);t1<=min(-n+6,-1);t1++) { for (t2=-t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } t2 = t1+2*n+1 ; for (t3=1;t3<=n;t3++) { i = t1+n-1 ; S2(t1+n-1,n,t3) ; } } if (n == 2) { for (t3=5;t3<=7;t3++) { if ((t3+1)%2 == 0) { k = (t3-3)/2 ; S1(2,1,(t3-3)/2) ; } } for (t2=4;t2<=6;t2++) { for (t3=1;t3<=2;t3++) { if (t2%2 == 0) { i = (t2-2)/2 ; j = (t2-2)/2 ; S2((t2-2)/2,(t2-2)/2,t3) ; } } } } for (t1=-n+7;t1<=-1;t1++) { for (t2=-t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 
0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } t2 = t1+2*n+1 ; for (t3=1;t3<=n;t3++) { i = t1+n-1 ; S2(t1+n-1,n,t3) ; } } if (n >= 3) { for (t1=0;t1<=min(1,-n+6);t1++) { for (t2=t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } } for (t1=max(-n+7,0);t1<=1;t1++) { for (t2=t1+2;t2<=-t1+4;t2++) { for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+5;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j 
= (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } for (t1=2;t1<=n-5;t1++) { t2 = t1+2 ; for (t3=t1+4;t3<=t1+2*n+2;t3++) { i = t1+1 ; if ((t1+t3)%2 == 0) { k = (-t1+t3-2)/2 ; S1(t1+1,1,(-t1+t3-2)/2) ; } } for (t2=t1+3;t2<=n-2;t2++) { for (t3=1;t3<=t2+1;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } for (t3=n+1;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=n-1;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = 
(-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } for (t1=max(2,n-4);t1<=floord(2*n-3,2);t1++) { t2 = t1+2 ; for (t3=t1+4;t3<=t1+2*n+2;t3++) { i = t1+1 ; if ((t1+t3)%2 == 0) { k = (-t1+t3-2)/2 ; S1(t1+1,1,(-t1+t3-2)/2) ; } } for (t2=t1+3;t2<=-t1+2*n;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } for (t3=t2+2;t3<=t2+2*n;t3++) { if ((t1+t2)%2 == 0) { i = (t1+t2)/2 ; j = (-t1+t2)/2 ; if ((t1+t3)%2 == 0) { k = (-t2+t3)/2 ; S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ; } } } } for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) { for (t3=1;t3<=n;t3++) { if ((t1+t2+1)%2 == 0) { i = (t1+t2-3)/2 ; j = (-t1+t2-1)/2 ; S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ; } } } } if (n >= 3) { t1 = n-1 ; t2 = n+1 ; for (t3=n+3;t3<=3*n+1;t3++) { if ((t3+n+1)%2 == 0) { k = (t3-n-1)/2 ; S1(n,1,(t3-n-1)/2) ; } } for (t2=n+2;t2<=n+4;t2++) { for (t3=1;t3<=n;t3++) { if ((t2+n)%2 == 0) { i = (t2+n-4)/2 ; j = (t2-n)/2 ; S2((t2+n-4)/2,(t2-n)/2,t3) ; } } } } if (n >= 1) { t2 = n+3 ; for (t3=1;t3<=n;t3++) { S2(n,1,t3) ; } } }
int main() { init_arrays(); double annot_t_start=0, annot_t_end=0, annot_t_total=0; int annot_i; for (annot_i=0; annot_i<REPS; annot_i++) { annot_t_start = rtclock(); /*@ begin PerfTuning ( def build { arg build_command = 'icc -O3 -openmp -I/usr/local/icc/include -lm'; } def performance_counter { arg repetitions = 1; } def performance_params { # param T1_1[] = [1,16,32,64,128]; # param T1_2[] = [1,16,32,64,128]; # param T1_3[] = [1,16,32,64,128]; # param T2_1[] = [1,4,8,16,32]; # param T2_2[] = [1,4,8,16,32]; # param T2_3[] = [1,4,8,16,32]; param T1_1[] = [64]; param T1_2[] = [256]; param T1_3[] = [64]; param T2_1[] = [1]; param T2_2[] = [1]; param T2_3[] = [1]; constraint c1 = (T1_1*T2_1<=1024 and T1_1*T2_1<=1024 and T1_1*T2_1<=1024); constraint c2 = ((T1_1 == T1_3) and (T2_1 == T2_3)); param U1[] = [1]; param U2[] = [1]; param U3[] = [7]; constraint c3 = (U1*U2*U3<=512); param PERM[] = [ #[0,1,2], #[0,2,1], #[1,0,2], #[1,2,0], [2,0,1], #[2,1,0], ]; param PAR[] = [True]; param SCREP[] = [False]; param IVEC[] = [True]; } def search { arg algorithm = 'Exhaustive'; # arg algorithm = 'Simplex'; # arg time_limit = 5; # arg total_runs = 1; } def input_params { param N[] = [1024]; } def input_vars { arg decl_file = 'decl_code.h'; arg init_file = 'init_code.c'; } ) @*/ /**-- (Generated by Orio) Best performance cost: 0.201184 Tuned for specific problem sizes: N = 1024 Best performance parameters: IVEC = True PAR = True PERM = [2, 0, 1] SCREP = False T1_1 = 64 T1_2 = 256 T1_3 = 64 T2_1 = 1 T2_2 = 1 T2_3 = 1 U1 = 1 U2 = 1 U3 = 7 --**/ register int i,j,k; register int c1t, c2t, c3t, c4t, c5t, c6t, c7t, c8t, c9t, c10t, c11t, c12t; register int newlb_c1, newlb_c2, newlb_c3, newlb_c4, newlb_c5, newlb_c6, newlb_c7, newlb_c8, newlb_c9, newlb_c10, newlb_c11, newlb_c12; register int newub_c1, newub_c2, newub_c3, newub_c4, newub_c5, newub_c6, newub_c7, newub_c8, newub_c9, newub_c10, newub_c11, newub_c12; /*@ begin PolySyn( parallel = PAR; tiles = [T1_1,T1_2,T1_3,T2_1,T2_2,T2_3]; 
permut = PERM; unroll_factors = [U1,U2,U3]; scalar_replace = SCREP; vectorize = IVEC; profiling_code = 'lu_profiling.c'; compile_cmd = 'gcc'; compile_opts = '-lm'; ) @*/ #include <math.h> #include <assert.h> #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) int c1, c2, c3, c4, c5, c6, c7, c8, c9; register int lb, ub, lb1, ub1, lb2, ub2; /* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 0.05s. */ for (c1=-1;c1<=floord(5*N-9,256);c1++) { lb1=max(max(ceild(32*c1-127,160),ceild(64*c1-N+2,64)),0); ub1=min(floord(64*c1+63,64),floord(N-1,256)); #pragma omp parallel for shared(c1,lb1,ub1) private(c2,c3,c4,c5,c6,c7,c8,c9) for (c2=lb1; c2<=ub1; c2++) { for (c3=max(ceild(32*c1-32*c2-1953,2016),ceild(32*c1-32*c2-31,32));c3<=floord(N-1,64);c3++) { if (c1 == c2+c3) { for (c7=max(64*c3,0);c7<=min(min(N-2,64*c3+62),256*c2+254);c7++) { for (c8=max(c7+1,256*c2);c8<=min(N-1,256*c2+255);c8++) { A[c7][c8]=A[c7][c8]/A[c7][c7] ; for (c9=c7+1;c9<=min(N-1,64*c3+63);c9++) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ; } } } } /*@ begin Loop( transform Composite( permut = [['c9', 'c7', 'c8']], regtile = (['c7', 'c8', 'c9'],[1, 1, 7]), scalarreplace = (False, 'double'), vector = (True, ['ivdep','vector always'])) for (c7=max(0,64*c1-64*c2);c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1);c7++) { for (c8=max(c7+1,256*c2);c8<=min(256*c2+255,N-1);c8++) { for (c9=64*c3;c9<=min(N-1,64*c3+63);c9++) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ; } } } ) @*/{ for (c9t=64*c3; c9t<=min(N-1,64*c3+63)-6; c9t=c9t+7) { for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) { register int cbv_1, cbv_2; cbv_1=max(c7+1,256*c2); cbv_2=min(256*c2+255,N-1); #pragma ivdep #pragma vector always for (c8=cbv_1; c8<=cbv_2; c8++ ) { A[c9t][c8]=A[c9t][c8]-A[c9t][c7]*A[c7][c8]; A[(c9t+1)][c8]=A[(c9t+1)][c8]-A[(c9t+1)][c7]*A[c7][c8]; 
A[(c9t+2)][c8]=A[(c9t+2)][c8]-A[(c9t+2)][c7]*A[c7][c8]; A[(c9t+3)][c8]=A[(c9t+3)][c8]-A[(c9t+3)][c7]*A[c7][c8]; A[(c9t+4)][c8]=A[(c9t+4)][c8]-A[(c9t+4)][c7]*A[c7][c8]; A[(c9t+5)][c8]=A[(c9t+5)][c8]-A[(c9t+5)][c7]*A[c7][c8]; A[(c9t+6)][c8]=A[(c9t+6)][c8]-A[(c9t+6)][c7]*A[c7][c8]; } } } for (c9=c9t; c9<=min(N-1,64*c3+63); c9=c9+1) { for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) { register int cbv_3, cbv_4; cbv_3=max(c7+1,256*c2); cbv_4=min(256*c2+255,N-1); #pragma ivdep #pragma vector always for (c8=cbv_3; c8<=cbv_4; c8++ ) { A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8]; } } } } /*@ end @*/ if ((-c1 == -c2-c3) && (c1 <= min(floord(320*c2+191,64),floord(64*c2+N-65,64)))) { for (c8=max(256*c2,64*c1-64*c2+64);c8<=min(256*c2+255,N-1);c8++) { A[64*c1-64*c2+63][c8]=A[64*c1-64*c2+63][c8]/A[64*c1-64*c2+63][64*c1-64*c2+63] ; } } } } } /* End of CLooG code */ /*@ end @*/ /*@ end @*/ annot_t_end = rtclock(); annot_t_total += annot_t_end - annot_t_start; } annot_t_total = annot_t_total / REPS; #ifndef TEST printf("%f\n", annot_t_total); #else { int i, j; for (i=0; i<N; i++) { for (j=0; j<N; j++) { if (j%100==0) printf("\n"); printf("%f ",A[i][j]); } printf("\n"); } } #endif return ((int) A[0][0]); }