Exemplo n.º 1
0
/*
 * Scans the iteration domain of statement S1 for the size parameter M.
 * No statement instance is executed when M < 3.
 */
void test(int M)
{
  /* Scattering iterators. */
  int c1, c2, c3, c4;
  /* Original iterators. */
  int i, j, k, l;

  /* The whole domain is empty below this parameter value. */
  if (M < 3) {
    return;
  }

  for (c1 = -1; c1 <= min(2,floord(M+2,4)); c1++) {
    for (c2 = max(ceild(2*c1-M+1,4),ceild(4*c1-M-2,4));
         c2 <= min(0,floord(c1,2));
         c2++) {
      for (c3 = max(max(-4*c2-2,4*c2+3),4*c1-4*c2+1);
           c3 <= min(min(min(M+3,-4*c2+9),4*c2+2*M),4*c1-4*c2+4);
           c3++) {
        /* c4 advances with a stride of 4. */
        for (c4 = max(3*c3-4*floord(c3+M+1,2)+6,4*c2-c3-4*floord(-c3+1,4)+2);
             c4 <= min(min(4*c2+4,-c3+10),c3-2);
             c4 += 4) {
          /* Skip c4 values for which c2 lies outside
           * [ceild(c4-4,4), floord(c4-1,4)]. */
          if ((c2 > floord(c4-1,4)) || (c2 < ceild(c4-4,4))) {
            continue;
          }
          S1(c1-c2,c2,(c3+c4-2)/4,(c3-c4)/2);
        }
      }
    }
  }
}
Exemplo n.º 2
0
/*
 * Scans statement S1 over the (c1, c2) bounds below; only even values of c2
 * carry a statement instance.
 */
void test(int n)
{
  /* Scattering iterators. */
  int c1, c2;
  /* Original iterators. */
  int i, j;

  for (c1 = 0; c1 <= 5*n; c1++) {
    for (c2 = max(c1-n,ceild(2*c1,3));
         c2 <= min(c1,floord(2*c1+2*n,3));
         c2++) {
      /* Odd c2: nothing to execute at this point. */
      if (c2%2 != 0) {
        continue;
      }
      /* The original iterators are recomputed before the statement call. */
      i = (-2*c1+3*c2)/2 ;
      j = c1-c2 ;
      S1((-2*c1+3*c2)/2,c1-c2) ;
    }
  }
}
Exemplo n.º 3
0
/*
 * Scans statement S1 inside the tile selected by outerTimeTileScatter,
 * outerProcTileScatter1 and outerProcTileScatter2 for the size parameters
 * M and N. Nothing is executed when the tile coordinates violate any of the
 * constraints checked up front.
 */
void test(int outerTimeTileScatter, int outerProcTileScatter1, int outerProcTileScatter2, int M, int N)
{
  /* Scattering iterators. */
  int compScatter1, compScatter2, compScatter3;
  /* Original iterators. */
  int compIter1, compIter2, compIter3, compIter4, compIter5, compIter6;

  /* Tile-validity guard: one constraint per line, evaluated in the same
   * short-circuit order as a single && chain. */
  if (!((M >= 2)
        && (N >= 3)
        && (outerProcTileScatter1 >= outerProcTileScatter2)
        && (5*outerProcTileScatter1 <= M+2*N-4)
        && (5*outerProcTileScatter1 <= 5*outerProcTileScatter2+N+2)
        && (outerProcTileScatter2 >= 0)
        && (5*outerProcTileScatter2 <= M+N-2)
        && (outerTimeTileScatter >= outerProcTileScatter1)
        && (outerTimeTileScatter <= 2*outerProcTileScatter1)
        && (outerTimeTileScatter <= outerProcTileScatter1+outerProcTileScatter2+1)
        && (5*outerTimeTileScatter <= 2*M+2*N-6)
        && (5*outerTimeTileScatter <= 5*outerProcTileScatter1+M+2)
        && (5*outerTimeTileScatter >= 10*outerProcTileScatter1-2*N-2)
        && (5*outerTimeTileScatter <= 5*outerProcTileScatter2+M+N)
        && (5*outerTimeTileScatter >= 10*outerProcTileScatter2-N-3)
        && (5*outerTimeTileScatter <= 10*outerProcTileScatter2+N+3)
        && (5*outerTimeTileScatter >= 5*outerProcTileScatter1+5*outerProcTileScatter2-N-4))) {
    return;
  }

  for (compScatter1 = max(max(max(max(max(4,5*outerTimeTileScatter),
                                      5*outerProcTileScatter2+1),
                                  5*outerProcTileScatter1+5*outerProcTileScatter2-N),
                              10*outerProcTileScatter1-2*N+2),
                          10*outerProcTileScatter2-N+1);
       compScatter1 <= min(min(min(min(min(5*outerTimeTileScatter+4,2*M+2*N-6),
                                       5*outerProcTileScatter1+M+2),
                                   5*outerProcTileScatter1+5*outerProcTileScatter2+5),
                               5*outerProcTileScatter2+M+N),
                           10*outerProcTileScatter2+N+3);
       compScatter1++) {
    for (compScatter2 = max(max(max(max(ceild(compScatter1+4,2),
                                        5*outerProcTileScatter1),
                                    5*outerProcTileScatter2+1),
                                compScatter1-M+2),
                            compScatter1-5*outerProcTileScatter2-1);
         compScatter2 <= min(min(min(min(floord(compScatter1+2*N-2,2),
                                         compScatter1),
                                     5*outerProcTileScatter1+4),
                                 compScatter1-5*outerProcTileScatter2+N),
                             5*outerProcTileScatter2+N+2);
         compScatter2++) {
      for (compScatter3 = max(max(5*outerProcTileScatter2,
                                  compScatter1-compScatter2+3),
                              compScatter2-N+2);
           compScatter3 <= min(min(compScatter2-1,
                                   5*outerProcTileScatter2+4),
                               compScatter1-compScatter2+N);
           compScatter3++) {
        S1(compScatter1-compScatter2+1,-compScatter1+compScatter2+compScatter3-2,compScatter2-compScatter3,compScatter1,compScatter2,compScatter3);
      }
    }
  }
}
Exemplo n.º 4
0
/*
 * Driver for the 3D lid-driven cavity run (D3Q19 lattice, as announced in
 * the banner printed below).
 *
 * Steps:
 *   1. derive omega from the global parameters and print the configuration;
 *   2. fill both copies of grid with the weights w1 / w2 / w3;
 *   3. run the tiled loop nest, calling lbm_kernel for every point; the
 *      parity of t5 selects which grid copy supplies the values and which
 *      one receives the destination pointers;
 *   4. with TIME defined, print elapsed minutes and MLUPS;
 *   5. with DEBUG defined, call dumpVelocities(t).
 *
 * Returns 0.
 */
int main(void) {
  int t = 0, z, y, x, k;
  double total_lattice_pts =
      (double)nZ * (double)nY * (double)nX * (double)nTimesteps;

  /* For timekeeping */
  int ts_return = -1;
  struct timeval start, end, result;
  double tdiff = 0.0;

  /* Compute values for global parameters */
  omega = 2.0 /
          ((6.0 * sqrt(uTopX * uTopX + uTopY * uTopY) * (nX - 0.5) / re) + 1.0);

  /* The cell count is computed in double, like total_lattice_pts above, so
   * that large grids cannot overflow int arithmetic. */
  printf(
      "3D Lid Driven Cavity simulation with D3Q19 lattice:\n"
      "\tscheme     : 3-Grid, Fused, Pull\n"
      "\tgrid size  : %d x %d x %d = %.2lf * 10^3 Cells\n"
      "\tnTimeSteps : %d\n"
      "\tRe         : %.2lf\n"
      "\tuTopX      : %.6lf\n"
      "\tuTopY      : %.6lf\n"
      "\tomega      : %.6lf\n",
      nX, nY, nZ, (double)nX * (double)nY * (double)nZ / 1.0e3, nTimesteps, re,
      uTopX, uTopY, omega);

  /* Initialize all 19 PDFs for each point in the domain to w1, w2 or w3
   * accordingly: k == 0 gets w1, k in [1, 6] gets w2, k in [7, nK) gets w3.
   * Both grid copies receive the same values. */
  for (z = 0; z < nZ + 2 + 4; z++) {
    for (y = 0; y < nY + 2 + 2; y++) {
      for (x = 0; x < nX + 2 + 2; x++) {
        grid[0][z][y][x][0] = w1;
        grid[1][z][y][x][0] = w1;

        for (k = 1; k < 7; k++) {
          grid[0][z][y][x][k] = w2;
          grid[1][z][y][x][k] = w2;
        }

        for (k = 7; k < nK; k++) {
          grid[0][z][y][x][k] = w3;
          grid[1][z][y][x][k] = w3;
        }
      }
    }
  }

  /* To satisfy PET */
  short _nX = nX + 3;
  short _nY = nY + 3;
  short _nZ = nZ + 4;
  /* Kept as int so a step count above SHRT_MAX is not silently narrowed. */
  int _nTimesteps = nTimesteps;

#ifdef TIME
  gettimeofday(&start, 0);
#endif

  int t1, t2, t3, t4, t5, t6, t7, t8;
  int lbp, ubp;
  register int lbv, ubv;
  /* Start of CLooG code */
  if ((_nTimesteps >= 1) && (_nX >= 2) && (_nY >= 2) && (_nZ >= 3)) {
    for (t1 = -1; t1 <= floord(_nTimesteps - 1, 8); t1++) {
      /* Bounds of the t2 loop, computed once per t1 before the parallel
       * region. */
      lbp = max(ceild(t1, 2), ceild(16 * t1 - _nTimesteps + 3, 16));
      ubp =
          min(floord(_nTimesteps + _nZ - 2, 16), floord(8 * t1 + _nZ + 6, 16));
#pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6, t7, t8)
      for (t2 = lbp; t2 <= ubp; t2++) {
        for (t3 = max(max(0, ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16));
             t3 <= min(min(min(floord(_nTimesteps + _nY - 2, 16),
                               floord(8 * t1 + _nY + 14, 16)),
                           floord(16 * t2 + _nY + 12, 16)),
                       floord(16 * t1 - 16 * t2 + _nZ + _nY + 13, 16));
             t3++) {
          for (t4 = max(
                   max(max(0, ceild(t1 - 1, 2)), ceild(16 * t2 - _nZ - 13, 16)),
                   ceild(16 * t3 - _nY - 13, 16));
               t4 <= min(min(min(min(floord(_nTimesteps + _nX - 2, 16),
                                     floord(8 * t1 + _nX + 14, 16)),
                                 floord(16 * t2 + _nX + 12, 16)),
                             floord(16 * t3 + _nX + 13, 16)),
                         floord(16 * t1 - 16 * t2 + _nZ + _nX + 13, 16));
               t4++) {
            for (t5 = max(max(max(max(max(0, 8 * t1), 16 * t1 - 16 * t2 + 2),
                                  16 * t2 - _nZ + 1),
                              16 * t3 - _nY + 1),
                          16 * t4 - _nX + 1);
                 t5 <= min(min(min(min(min(_nTimesteps - 1, 8 * t1 + 15),
                                       16 * t2 + 13),
                                   16 * t3 + 14),
                               16 * t4 + 14),
                           16 * t1 - 16 * t2 + _nZ + 14);
                 t5++) {

              /* Hoisted loop conditional.
               * Even t5: values are taken from grid[0] and the destination
               * pointers refer to grid[1]; odd t5 swaps the two copies. */
              if (t5 % 2 == 0) {
                for (t6 = max(max(16 * t2, t5 + 2),
                              -16 * t1 + 16 * t2 + 2 * t5 - 15);
                     t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5),
                               t5 + _nZ - 1);
                     t6++) {
                  for (t7 = max(16 * t3, t5 + 1);
                       t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) {
                    lbv = max(16 * t4, t5 + 1);
                    ubv = min(16 * t4 + 15, t5 + _nX - 1);

#pragma ivdep
#pragma vector always
                    for (t8 = lbv; t8 <= ubv; t8++) {
                      lbm_kernel(
                          grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0],
                          grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1],
                          grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2],
                          grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3],
                          grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4],
                          grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5],
                          grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6],
                          grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1]
                              [7],
                          grid[0][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1]
                              [8],
                          grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1]
                              [9],
                          grid[0][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) + 1]
                              [10],
                          grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1]
                              [11],
                          grid[0][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1]
                              [12],
                          grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1]
                              [13],
                          grid[0][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1]
                              [14],
                          grid[0][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)]
                              [15],
                          grid[0][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)]
                              [16],
                          grid[0][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)]
                              [17],
                          grid[0][(-t5 + t6) + 1][(-t5 + t7) + 1][(-t5 + t8)]
                              [18],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17],
                          &grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18],
                          (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8)));
                      ;
                    }
                  }
                }
              } else {
                for (t6 = max(max(16 * t2, t5 + 2),
                              -16 * t1 + 16 * t2 + 2 * t5 - 15);
                     t6 <= min(min(16 * t2 + 15, -16 * t1 + 16 * t2 + 2 * t5),
                               t5 + _nZ - 1);
                     t6++) {
                  for (t7 = max(16 * t3, t5 + 1);
                       t7 <= min(16 * t3 + 15, t5 + _nY - 1); t7++) {
                    lbv = max(16 * t4, t5 + 1);
                    ubv = min(16 * t4 + 15, t5 + _nX - 1);

#pragma ivdep
#pragma vector always
                    for (t8 = lbv; t8 <= ubv; t8++) {
                      lbm_kernel(
                          grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0],
                          grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) - 1][1],
                          grid[1][(-t5 + t6)][(-t5 + t7)][(-t5 + t8) + 1][2],
                          grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8)][3],
                          grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8)][4],
                          grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8)][5],
                          grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8)][6],
                          grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) - 1]
                              [7],
                          grid[1][(-t5 + t6)][(-t5 + t7) - 1][(-t5 + t8) + 1]
                              [8],
                          grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) - 1]
                              [9],
                          grid[1][(-t5 + t6)][(-t5 + t7) + 1][(-t5 + t8) + 1]
                              [10],
                          grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) - 1]
                              [11],
                          grid[1][(-t5 + t6) - 1][(-t5 + t7)][(-t5 + t8) + 1]
                              [12],
                          grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) - 1]
                              [13],
                          grid[1][(-t5 + t6) + 1][(-t5 + t7)][(-t5 + t8) + 1]
                              [14],
                          grid[1][(-t5 + t6) - 1][(-t5 + t7) - 1][(-t5 + t8)]
                              [15],
                          grid[1][(-t5 + t6) - 1][(-t5 + t7) + 1][(-t5 + t8)]
                              [16],
                          grid[1][(-t5 + t6) + 1][(-t5 + t7) - 1][(-t5 + t8)]
                              [17],
                          grid[1][(-t5 + t6) + 1][(-t5 + t7) + 1][(-t5 + t8)]
                              [18],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][0],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][1],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][2],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][3],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][4],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][5],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][6],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][7],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][8],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][9],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][10],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][11],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][12],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][13],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][14],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][15],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][16],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][17],
                          &grid[0][(-t5 + t6)][(-t5 + t7)][(-t5 + t8)][18],
                          (t5), ((-t5 + t6)), ((-t5 + t7)), ((-t5 + t8)));
                      ;
                    }
                  }
                }
              }
              /* end hoisted if */
            }
          }
        }
      }
    }
  }
/* End of CLooG code */

#ifdef TIME
  gettimeofday(&end, 0);

  ts_return = timeval_subtract(&result, &end, &start);
  tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6);

  /* Elapsed time is reported in minutes (note the trailing 'm'). */
  printf("\tTime taken : %7.5lfm\n", tdiff / 60.0);
  printf("\tMLUPS      : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff)));
#endif

#ifdef DEBUG
  /* Dump rho, uX, uY for the entire domain to verify results */
  dumpVelocities(t);
#endif

  return 0;
}
Exemplo n.º 5
0
/*
 * Driver for the 2D flow-past-cylinder run (D2Q9 lattice, as announced in
 * the banner printed below).
 *
 * Steps:
 *   1. derive omega, circle_R2 and the average density rho_avg, then print
 *      the configuration;
 *   2. fill both copies of grid with w1 / w2 / w3 scaled by rho_avg;
 *   3. run the tiled loop nest, calling lbm_kernel for every point; the
 *      parity of t4 selects which grid copy supplies the values and which
 *      one receives the destination pointers;
 *   4. with TIME defined, print elapsed seconds and MLUPS;
 *   5. with DEBUG defined, call dumpVelocities(t).
 *
 * Returns 0.
 */
int main(void) {
  /* t is only consumed by dumpVelocities() under DEBUG; it is initialised so
   * that call never reads an indeterminate value. */
  int t = 0, y, x, k;
  double total_lattice_pts = (double)nY * (double)nX * (double)nTimesteps;

  /* For timekeeping */
  int ts_return = -1;
  struct timeval start, end, result;
  double tdiff = 0.0;

  /* Compute values for global parameters */
  omega = 1.0 / tau;
  circle_R2 = circle_radius * circle_radius;

  double rho_avg = (rho_in + rho_out) / 2.0;

  /* The cell count is computed in double, like total_lattice_pts above, so
   * that large grids cannot overflow int arithmetic. */
  printf(
      "2D Flow Past Cylinder simulation with D2Q9 lattice:\n"
      "\tscheme     : 2-Grid, Fused, Pull\n"
      "\tgrid size  : %d x %d = %.2lf * 10^3 Cells\n"
      "\tnTimeSteps : %d\n"
      "\tomega      : %.6lf\n",
      nX, nY, (double)nX * (double)nY / 1.0e3, nTimesteps, omega);

  /* Initialize all 9 PDFs for each point in the domain to the lattice weight
   * scaled by rho_avg: k == 0 gets w1, k in [1, 4] gets w2, k in [5, nK)
   * gets w3. Both grid copies receive the same values. */
  for (y = 0; y < nY + 2 + 4; y++) {
    for (x = 0; x < nX + 2 + 2; x++) {
      grid[0][y][x][0] = w1 * rho_avg;
      grid[1][y][x][0] = w1 * rho_avg;

      for (k = 1; k < 5; k++) {
        grid[0][y][x][k] = w2 * rho_avg;
        grid[1][y][x][k] = w2 * rho_avg;
      }

      for (k = 5; k < nK; k++) {
        grid[0][y][x][k] = w3 * rho_avg;
        grid[1][y][x][k] = w3 * rho_avg;
      }
    }
  }

  /* To satisfy PET */
  short _nX = nX + 2;
  short _nY = nY + 3;
  int _nTimesteps = nTimesteps;

#ifdef TIME
  gettimeofday(&start, 0);
#endif

  int t1, t2, t3, t4, t5, t6;
  int lbp, ubp;
  register int lbv, ubv;
  /* Start of CLooG code */
  if ((_nTimesteps >= 1) && (_nX >= 3) && (_nY >= 4)) {
    for (t1 = -1; t1 <= floord(5 * _nTimesteps + 3 * _nY - 8, 32); t1++) {
      /* Bounds of the t2 loop, computed once per t1 before the parallel
       * region. */
      lbp = max(max(ceild(4 * t1, 5), ceild(16 * t1 - _nTimesteps + 1, 12)),
                ceild(32 * t1 - _nTimesteps + 4, 32));
      ubp = min(min(floord(4 * t1 + 4, 3), floord(_nTimesteps + _nY - 2, 8)),
                floord(16 * t1 + _nY + 14, 20));
#pragma omp parallel for private(lbv, ubv, t3, t4, t5, t6)
      for (t2 = lbp; t2 <= ubp; t2++) {
        for (t3 = max(max(0, ceild(4 * t1 - 3 * t2 - 1, 2)),
                      ceild(8 * t2 - _nY - 4, 8));
             t3 <= min(min(min(floord(_nTimesteps + _nX - 2, 8),
                               floord(8 * t2 + _nX + 3, 8)),
                           floord(16 * t1 - 12 * t2 + _nX + 18, 8)),
                       floord(32 * t1 - 32 * t2 + _nY + _nX + 29, 8));
             t3++) {
          for (t4 = max(
                   max(max(max(0, 16 * t1 - 12 * t2), 32 * t1 - 32 * t2 + 3),
                       8 * t2 - _nY + 1),
                   8 * t3 - _nX + 1);
               t4 <= min(min(min(min(_nTimesteps - 1, 8 * t2 + 4), 8 * t3 + 5),
                             16 * t1 - 12 * t2 + 19),
                         32 * t1 - 32 * t2 + _nY + 30);
               t4++) {

            /* Hoisted loop conditional.
             * Even t4: values are taken from grid[0] and the destination
             * pointers refer to grid[1]; odd t4 swaps the two copies. */
            if (t4 % 2 == 0) {
              for (t5 = max(max(8 * t2, t4 + 3),
                            -32 * t1 + 32 * t2 + 2 * t4 - 31);
                   t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4),
                             t4 + _nY - 1);
                   t5++) {
                lbv = max(8 * t3, t4 + 2);
                ubv = min(8 * t3 + 7, t4 + _nX - 1);

#pragma ivdep
#pragma vector always
                for (t6 = lbv; t6 <= ubv; t6++) {
                  lbm_kernel(grid[0][(-t4 + t5)][(-t4 + t6)][0],
                             grid[0][(-t4 + t5) - 1][(-t4 + t6)][3],
                             grid[0][(-t4 + t5) + 1][(-t4 + t6)][4],
                             grid[0][(-t4 + t5)][(-t4 + t6) - 1][1],
                             grid[0][(-t4 + t5)][(-t4 + t6) + 1][2],
                             grid[0][(-t4 + t5) - 1][(-t4 + t6) - 1][5],
                             grid[0][(-t4 + t5) - 1][(-t4 + t6) + 1][6],
                             grid[0][(-t4 + t5) + 1][(-t4 + t6) - 1][7],
                             grid[0][(-t4 + t5) + 1][(-t4 + t6) + 1][8],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][0],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][3],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][4],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][1],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][2],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][5],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][6],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][7],
                             &grid[1][(-t4 + t5)][(-t4 + t6)][8], (t4),
                             ((-t4 + t5)), ((-t4 + t6)));
                  ;
                }
              }
            } else {
              for (t5 = max(max(8 * t2, t4 + 3),
                            -32 * t1 + 32 * t2 + 2 * t4 - 31);
                   t5 <= min(min(8 * t2 + 7, -32 * t1 + 32 * t2 + 2 * t4),
                             t4 + _nY - 1);
                   t5++) {
                lbv = max(8 * t3, t4 + 2);
                ubv = min(8 * t3 + 7, t4 + _nX - 1);

#pragma ivdep
#pragma vector always
                for (t6 = lbv; t6 <= ubv; t6++) {
                  lbm_kernel(grid[1][(-t4 + t5)][(-t4 + t6)][0],
                             grid[1][(-t4 + t5) - 1][(-t4 + t6)][3],
                             grid[1][(-t4 + t5) + 1][(-t4 + t6)][4],
                             grid[1][(-t4 + t5)][(-t4 + t6) - 1][1],
                             grid[1][(-t4 + t5)][(-t4 + t6) + 1][2],
                             grid[1][(-t4 + t5) - 1][(-t4 + t6) - 1][5],
                             grid[1][(-t4 + t5) - 1][(-t4 + t6) + 1][6],
                             grid[1][(-t4 + t5) + 1][(-t4 + t6) - 1][7],
                             grid[1][(-t4 + t5) + 1][(-t4 + t6) + 1][8],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][0],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][3],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][4],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][1],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][2],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][5],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][6],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][7],
                             &grid[0][(-t4 + t5)][(-t4 + t6)][8], (t4),
                             ((-t4 + t5)), ((-t4 + t6)));
                  ;
                }
              }
            }
            /* end hoisted if */
          }
        }
      }
    }
  }
/* End of CLooG code */

#ifdef TIME
  gettimeofday(&end, 0);

  ts_return = timeval_subtract(&result, &end, &start);
  tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6);

  printf("\tTime taken : %7.5lfs\n", tdiff);
  printf("\tMLUPS      : %7.5lf\n", (total_lattice_pts / (1.0e6 * tdiff)));
#endif

#ifdef DEBUG
  /* Dump rho, uX, uY for the entire domain to verify results */
  dumpVelocities(t);
#endif

  return 0;
}
Exemplo n.º 6
0
/*
 * Timing driver: calls init_arrays() once, then runs the tiled loop nest
 * below REPS times, accumulating the rtclock() difference around each
 * repetition, and prints the average per repetition.
 *
 * Returns A[0][0] converted to int.
 */
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++)
  {
    annot_t_start = rtclock();
    



/* NOTE: the headers and macros below are introduced inside the body of the
 * repetition loop, i.e. at block scope of main(). */
#include <math.h>
#include <assert.h>

/* Rounded divisions carried out in floating point via ceil()/floor(); the
 * results are doubles and are converted back when stored in int variables. */
#define ceild(n,d)  ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
/* Function-like macros: each argument may be evaluated more than once. */
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))

	/* Statement body: A[i][j] becomes the sum of the 3x3 neighbourhood
	 * around (i, j) divided by 9. The zT0..zT5 and t parameters are not
	 * used by the expansion. */
	#define S1(zT0,zT1,zT2,zT3,zT4,zT5,t,i,j)	{A[i][j]=(A[1+i][1+j]+A[1+i][j]+A[1+i][j-1]+A[i][1+j]+A[i][j]+A[i][j-1]+A[i-1][1+j]+A[i-1][j]+A[i-1][j-1])/9;}

	int c1, c2, c3, c4, c5, c6, c7, c8, c9;

	/* lb/ub and lbv/ubv are declared but not referenced in this block. */
	register int lb, ub, lb1, ub1, lb2, ub2;
	register int lbv, ubv;

	/* Enable nested parallel regions and request two threads (OpenMP
	 * runtime calls); the c2 and c3 loops below are both parallel loops. */
	omp_set_nested(1);
	omp_set_num_threads(2);
/* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 5.45s. */
for (c1=-2;c1<=floord(4*T+3*N-10,256);c1++) {
	/* Bounds of the c2 loop, computed once per c1. */
	lb1=max(max(max(0,ceild(256*c1-2*T-N-251,512)),ceild(256*c1-3*T-2*N+7,256)),ceild(256*c1-N-761,1024));
	ub1=min(min(min(floord(256*c1+2*N+505,1024),floord(256*c1+509,512)),floord(64*c1+127,64)),floord(T+N-3,256));
#pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9)
	for (c2=lb1; c2<=ub1; c2++) {
	/* Bounds of the c3 loop, computed once per (c1, c2). */
	lb2=max(max(max(max(max(max(ceild(256*c1-256*c2-T+1,256),ceild(512*c1-512*c2-253,768)),0),ceild(512*c2-N-252,256)),ceild(128*c1-256*c2-127,128)),ceild(128*c2-127,128)),ceild(128*c1-127,256));
	ub2=min(min(min(min(min(min(floord(256*c1-256*c2+255,256),floord(256*c1-512*c2+N+253,256)),floord(256*c2+T+N+252,256)),floord(T+N-3,128)),floord(256*c1+N+508,512)),floord(256*c1-256*c2+N+253,384)),floord(512*c2+N+507,256));
#pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9)
	for (c3=lb2; c3<=ub2; c3++) {
      for (c4=max(max(max(max(0,ceild(-256*c2+256*c3-N-284,32)),8*c1-8*c2-8*c3),ceild(256*c2-N-29,32)),ceild(128*c3-N-29,32));c4<=min(min(min(min(8*c1-8*c2-8*c3+7,floord(256*c3+253,64)),floord(T-1,32)),floord(128*c2+127,16)),floord(-128*c2+128*c3+127,16));c4++) {
        for (c5=max(max(max(max(max(8*c2,ceild(16*c4-15,16)),ceild(256*c3-T-N-28,32)),0),ceild(256*c3-32*c4-N-60,32)),ceild(256*c3-N-59,64));c5<=min(min(min(min(min(floord(32*c4+N+29,32),floord(128*c3+127,16)),8*c2+7),floord(128*c3-16*c4+127,16)),floord(T+N-3,32)),floord(256*c3+N+252,64));c5++) {
          for (c6=max(max(max(max(max(ceild(64*c4-29,32),8*c3),ceild(16*c5-15,16)),ceild(16*c4+16*c5-15,16)),0),ceild(64*c5-N-28,32));c6<=min(min(min(min(min(8*c3+7,floord(T+N-3,16)),floord(32*c4+32*c5+N+60,32)),floord(32*c4+N+29,16)),floord(64*c5+N+59,32)),floord(32*c5+T+N+28,32));c6++) {
            for (c7=max(max(max(max(0,32*c4),32*c5-N+2),16*c6-N+2),-32*c5+32*c6-N-29);c7<=min(min(min(min(-32*c5+32*c6+30,floord(32*c6+29,2)),T-1),32*c5+30),32*c4+31);c7++) {
/* The annotation comment that follows only describes an unroll/unroll-jam
 * request with factor 8 for the c8/c9 loops; being a comment, it is not
 * compiled. The loops between it and the "end" marker are the code that
 * actually runs. */
/*@ begin Loop(
transform UnrollJam(ufactor=8)
              for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) 
transform Unroll(ufactor=8)
                for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) 
{
                  S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ;
}
) @*/
              for (c8=max(max(32*c5,c7+1),32*c6-c7-N+2);c8<=min(min(32*c6-c7+30,32*c5+31),c7+N-2);c8++) {
                for (c9=max(c7+c8+1,32*c6);c9<=min(32*c6+31,c7+c8+N-2);c9++) {
                  /* Only the last two arguments (-c7+c8 and -c7-c8+c9) are
                   * used by S1, as the i and j indices into A. */
                  S1(c1-c2-c3,-c1+2*c2+c3,-c1+2*c3,c4,-c4+c5,-c4-c5+c6,c7,-c7+c8,-c7-c8+c9) ;
                }
              }
/*@ end @*/
            }
          }
        }
      }
    }
  }
}
/* End of CLooG code */


    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  
  /* Average over all repetitions. */
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  
  /* NOTE(review): returning an element of A presumably keeps the computed
   * result observable so the loop nest is not optimised away - confirm. */
  return ((int) A[0][0]); 

}
Exemplo n.º 7
0
/* Generated from ./non_optimal/nul_complex1.cloog by CLooG 0.18.1-2-g43fc508 gmp bits in 0.00s. */
/* Scan S1 over the (c1, c2) bounds below; nothing runs for negative n. */
if (n >= 0) {
  for (c1 = 0; c1 <= 5*n; c1++) {
    for (c2 = max(ceild(2*c1,3),c1-n);
         c2 <= min(floord(2*c1+2*n,3),c1);
         c2++) {
      /* Odd c2 values carry no statement instance. */
      if (c2%2 != 0) {
        continue;
      }
      S1(((-2*c1+3*c2)/2),(c1-c2));
    }
  }
}
Exemplo n.º 8
0
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++)
  {
    annot_t_start = rtclock();
    
  


int t, i, j, k, l, m, n,ii,jj;

	#define S1(zT0,zT1,t,j)	{ey[0][j]=t;}
	#define S2(zT0,zT1,zT2,t,i,j)	{ey[i][j]=ey[i][j]-((double)(1))/2*(hz[i][j]-hz[i-1][j]);}
	#define S3(zT0,zT1,zT2,t,i,j)	{ex[i][j]=ex[i][j]-((double)(1))/2*(hz[i][j]-hz[i][j-1]);}
	#define S4(zT0,zT1,zT2,t,i,j)	{hz[i][j]=hz[i][j]-((double)(7))/10*(ey[1+i][j]+ex[i][1+j]-ex[i][j]-ey[i][j]);}

	int c1, c2, c3, c4, c5, c6, c7;

	register int lbv, ubv;

for (c1=0;c1<=floord(tmax-1,32);c1++) {
  for (c2=max(ceild(32*c1-31,32),0);c2<=min(floord(tmax+ny-1,32),floord(32*c1+ny+31,32));c2++) {
for (c3=max(max(max(max(ceild(32*c2-ny-30,32),0),ceild(64*c1-32*c2-61,32)),ceild(32*c1-31,32)),ceild(32*c1-992*c2-1891,992));c3<=min(min(floord(32*c2+nx+30,32),floord(tmax+nx-1,32)),floord(32*c1+nx+31,32));c3++) {
      if ((c1 <= floord(32*c3-nx,32)) && (c2 <= floord(32*c3-nx+ny,32)) && (c3 >= ceild(nx,32))) {
        for (c5=max(32*c3-nx+1,32*c2);c5<=min(32*c2+31,32*c3-nx+ny);c5++) {
          S4(c1,-c1+c3,-c1+c2,32*c3-nx,nx-1,-32*c3+c5+nx-1) ;
        }
      }
      if ((c1 <= floord(32*c2-ny,32)) && (c2 >= max(ceild(32*c3-nx+ny+1,32),ceild(ny,32)))) {
        for (c6=max(32*c3,32*c2-ny+1);c6<=min(32*c2+nx-ny,32*c3+31);c6++) {
          S4(c1,-c1+c3,-c1+c2,32*c2-ny,-32*c2+c6+ny-1,ny-1) ;
        }
      }
      if (c1 == c3) {
        for (c4=max(max(32*c2-ny+1,0),32*c3);c4<=min(min(32*c3+30,32*c2-ny+31),tmax-1);c4++) {
          for (c5=32*c2;c5<=c4+ny-1;c5++) {
            S1(c1,-c1+c2,c4,-c4+c5) ;
            S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
            for (c6=c4+1;c6<=32*c3+31;c6++) {
              S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
            }
          }
          for (c6=c4+1;c6<=32*c3+31;c6++) {
            S4(c1,0,-c1+c2,c4,-c4+c6-1,ny-1) ;
          }
        }
      }
      if (c1 == c3) {
        for (c4=max(max(0,32*c3),32*c2-ny+32);c4<=min(min(tmax-1,32*c3+30),32*c2-1);c4++) {
          for (c5=32*c2;c5<=32*c2+31;c5++) {
            S1(c1,-c1+c2,c4,-c4+c5) ;
            S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
            for (c6=c4+1;c6<=32*c3+31;c6++) {
              S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
            }
          }
        }
      }
      if (c1 == c3) {
        for (c4=max(max(32*c2,0),32*c3);c4<=min(min(tmax-1,32*c3+30),32*c2+30);c4++) {
          S1(c1,-c1+c2,c4,0) ;
          for (c6=c4+1;c6<=32*c3+31;c6++) {
            S2(c1,0,-c1+c2,c4,-c4+c6,0) ;
          }
          for (c5=c4+1;c5<=32*c2+31;c5++) {
            S1(c1,-c1+c2,c4,-c4+c5) ;
            S3(c1,0,-c1+c2,c4,0,-c4+c5) ;
            for (c6=c4+1;c6<=32*c3+31;c6++) {
              S2(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S3(c1,0,-c1+c2,c4,-c4+c6,-c4+c5) ;
              S4(c1,0,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
            }
          }
        }
      }
      for (c4=max(max(max(32*c1,0),32*c2-ny+1),32*c3-nx+1);c4<=min(min(min(32*c3-nx+31,32*c2-ny+31),32*c1+31),tmax-1);c4++) {
        for (c5=32*c2;c5<=c4+ny-1;c5++) {
          for (c6=32*c3;c6<=c4+nx-1;c6++) {
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
          }
          S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
        }
        for (c6=32*c3;c6<=c4+nx;c6++) {
          S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ;
        }
      }
      for (c4=max(max(max(32*c1,0),32*c3-nx+1),32*c2-ny+32);c4<=min(min(min(tmax-1,32*c1+31),32*c2-1),32*c3-nx+31);c4++) {
        for (c5=32*c2;c5<=32*c2+31;c5++) {
          for (c6=32*c3;c6<=c4+nx-1;c6++) {
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
          }
          S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
        }
      }
      for (c4=max(max(max(32*c3-nx+32,32*c1),0),32*c2-ny+1);c4<=min(min(min(32*c2-ny+31,32*c1+31),tmax-1),32*c3-1);c4++) {
        for (c5=32*c2;c5<=c4+ny-1;c5++) {
          for (c6=32*c3;c6<=32*c3+31;c6++) {
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
          }
        }
        for (c6=32*c3;c6<=32*c3+31;c6++) {
          S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,ny-1) ;
        }
      }
      for (c4=max(max(max(32*c2,32*c1),0),32*c3-nx+1);c4<=min(min(min(32*c2+30,tmax-1),32*c1+31),32*c3-nx+31);c4++) {
        for (c6=32*c3;c6<=c4+nx-1;c6++) {
          S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ;
        }
        for (c5=c4+1;c5<=32*c2+31;c5++) {
          for (c6=32*c3;c6<=c4+nx-1;c6++) {
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
          }
          S4(c1,-c1+c3,-c1+c2,c4,nx-1,-c4+c5-1) ;
        }
      }
      for (c4=max(max(max(32*c1,0),32*c3-nx+32),32*c2-ny+32);c4<=min(min(min(32*c3-1,tmax-1),32*c1+31),32*c2-1);c4++) {
/*@ begin Loop(
 transform Composite(                                                                        
  tile = [('c5',T1,'ii'),('c6',T2,'jj')],
  permut = [PERMUTS],
  unrolljam = [('c5',U1),('c6',U2)],
  vector = (VEC, ['ivdep','vector always'])
 )                        
        for (c5=32*c2;c5<=32*c2+31;c5++) 
          for (c6=32*c3;c6<=32*c3+31;c6++) 
{
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
}
) @*/{
  for (c6=32*c3; c6<=32*c3+28; c6=c6+4) {
    register int cbv_1, cbv_2;
    cbv_1=32*c2;
    cbv_2=32*c2+31;
#pragma ivdep
#pragma vector always
    for (c5=cbv_1; c5<=cbv_2; c5=c5+1) {
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5);
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5);
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5);
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5);
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5);
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6+3,-c4+c5);
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1);
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5-1);
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+1,-c4+c5-1);
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6+2,-c4+c5-1);
    }
  }
  for (; c6<=32*c3+31; c6=c6+1) {
    register int cbv_3, cbv_4;
    cbv_3=32*c2;
    cbv_4=32*c2+31;
#pragma ivdep
#pragma vector always
    for (c5=cbv_3; c5<=cbv_4; c5=c5+1) {
      S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
      S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5);
      S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1);
    }
  }
}
/*@ end @*/
      }
      for (c4=max(max(max(32*c2,32*c3-nx+32),32*c1),0);c4<=min(min(min(32*c3-1,32*c2+30),tmax-1),32*c1+31);c4++) {
        for (c6=32*c3;c6<=32*c3+31;c6++) {
          S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,0) ;
        }
        for (c5=c4+1;c5<=32*c2+31;c5++) {
          for (c6=32*c3;c6<=32*c3+31;c6++) {
            S2(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S3(c1,-c1+c3,-c1+c2,c4,-c4+c6,-c4+c5) ;
            S4(c1,-c1+c3,-c1+c2,c4,-c4+c6-1,-c4+c5-1) ;
          }
        }
      }
      if ((c1 == c3) && (c2 <= min(floord(32*c3-1,32),floord(tmax-32,32)))) {
        S1(c1,-c1+c2,32*c2+31,0) ;
        for (c6=32*c2+32;c6<=32*c3+31;c6++) {
          S2(c1,0,-c1+c2,32*c2+31,-32*c2+c6-31,0) ;
        }
      }
      if ((-c1 == -c3) && (c1 >= ceild(32*c2-31,32)) && (c1 <= min(floord(tmax-32,32),floord(32*c2-1,32)))) {
        S1(c1,-c1+c2,32*c1+31,0) ;
        for (c5=32*c1+32;c5<=32*c2+31;c5++) {
          S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ;
          S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ;
        }
      }
      if ((-c1 == -c3) && (c1 <= min(floord(tmax-32,32),c2-1))) {
        for (c5=32*c2;c5<=min(32*c2+31,32*c1+ny+30);c5++) {
          S1(c1,-c1+c2,32*c1+31,-32*c1+c5-31) ;
          S3(c1,0,-c1+c2,32*c1+31,0,-32*c1+c5-31) ;
        }
      }
      if ((-c1 == -c2) && (-c1 == -c3) && (c1 <= floord(tmax-32,32))) {
        S1(c1,0,32*c1+31,0) ;
      }
      if ((c1 >= c2) && (c2 <= min(c3-1,floord(tmax-32,32)))) {
        for (c6=32*c3;c6<=min(32*c2+nx+30,32*c3+31);c6++) {
          S2(c1,-c1+c3,-c1+c2,32*c2+31,-32*c2+c6-31,0) ;
        }
      }
    }
  }
}



    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  
  return 1;
}
Exemplo n.º 9
0
/*
 * Benchmark driver for a tiled, OpenMP-parallel kernel over the global array A.
 *
 * Calls init_arrays() once, then executes the generated loop nest REPS times.
 * Each repetition is timed with rtclock(); the mean time is printed with "%f\n".
 * The return value is A[0][0] converted to int.
 * NOTE(review): init_arrays() is not re-run between repetitions, so every
 * repetition after the first operates on the output of the previous one.
 */
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  /* Both the c2 and the c3 loops below carry "omp parallel for", i.e. the
     parallel regions are nested; nesting is switched on here. */
  omp_set_nested(1);
  omp_set_num_threads(2);

  for (annot_i=0; annot_i<REPS; annot_i++)
  {
    annot_t_start = rtclock();
   
    register int i,j,k;


    
	/* Statement macros used by the generated loop nest. Only the trailing
	   index arguments are used; the leading zT* arguments are ignored.
	   S1 divides A[k][j] by the diagonal entry A[k][k];
	   S2 subtracts A[i][k]*A[k][j] from A[i][j]. */
	#define S1(zT0,zT1,zT2,zT3,k,j)	{A[k][j]=A[k][j]/A[k][k];}
	#define S2(zT0,zT1,zT2,zT3,zT4,zT5,k,i,j)	{A[i][j]=A[i][j]-A[i][k]*A[k][j];}

	int c1, c2, c3, c4, c5, c6, c7, c8, c9;

	register int lb, ub, lb1, ub1, lb2, ub2;
	register int lbv, ubv;

/* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 2.21s. */
/* c1 runs sequentially; for each c1 the c2 range [lb1,ub1] and, inside it,
   the c3 range [lb2,ub2] are distributed over threads. */
for (c1=-2;c1<=floord(3*N-4,256);c1++) {
	lb1=max(max(0,ceild(256*c1-N-253,512)),ceild(256*c1-2*N+3,256));
	ub1=min(floord(128*c1+255,128),floord(N-1,256));
#pragma omp parallel for shared(c1,lb1,ub1) private(lb2,ub2,c2,c3,c4,c5,c6,c7,c8,c9)
	for (c2=lb1; c2<=ub1; c2++) {
	lb2=max(max(max(ceild(256*c1-256*c2-N+2,256),ceild(128*c1-256*c2-127,128)),ceild(128*c1-128*c2-32385,32768)),ceild(128*c1-128*c2-127,256));
	ub2=min(floord(N-1,256),floord(256*c1-256*c2+255,256));
#pragma omp parallel for shared(c1,c2,lb1,ub1,lb2,ub2) private(c3,c4,c5,c6,c7,c8,c9)
	for (c3=lb2; c3<=ub2; c3++) {
      for (c4=max(max(8*c1-8*c2-8*c3,0),8*c1-8*c2-1800*c3-1778);c4<=min(min(min(min(floord(3968*c3+3937,16),8*c1-8*c2-8*c3+7),floord(128*c2+127,16)),floord(N-2,32)),floord(128*c3+127,16));c4++) {
        for (c5=max(max(ceild(16*c4-15,16),0),8*c2);c5<=min(floord(N-1,32),8*c2+7);c5++) {
          for (c6=max(max(max(max(ceild(16*c4-465,496),ceild(8*c1-8*c2-16*c3-c4-217,223)),ceild(-8*c1+8*c2+16*c3+c4-217,225)),8*c3),ceild(16*c4-15,16));c6<=min(8*c3+7,floord(N-1,32));c6++) {
            /* Case c4 == c6 on the c1 == c2+2*c3 plane: S1 and S2 are
               interleaved per (c7,c8) point. The repeated (c1-c2)%2 tests
               are emitted by the generator; the inner duplicate is redundant. */
            if ((c1 == c2+2*c3) && (c4 == c6)) {
              for (c7=max(0,32*c6);c7<=min(min(32*c5+30,32*c6+30),N-2);c7++) {
                for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) {
                  if ((c1-c2)%2 == 0) {
                    S1((c1-c2)/2,c2,c4,c5,c7,c8) ;
                  }
                  for (c9=c7+1;c9<=min(32*c6+31,N-1);c9++) {
                    if ((c1-c2)%2 == 0) {
                      if ((c1-c2)%2 == 0) {
                        S2((c1-c2)/2,(c1-c2)/2,c2,c4,c4,c5,c7,c9,c8) ;
                      }
                    }
                  }
                }
              }
            }
            /* S2-only region. The annotation comment that follows holds the
               original (c8,c9) loops plus the requested transformations
               (unroll-and-jam by 8 on c8, unroll by 8 on c9); the braces
               after it hold the expanded result: an 8x8 unrolled body plus
               remainder loops for c9 and for c8. */
            for (c7=max(32*c4,0);c7<=min(min(32*c6-1,32*c5+30),32*c4+31);c7++) {
/*@ begin Loop(
transform UnrollJam(ufactor=8)
              for (c8=max(c7+1,32*c5);c8<=min(32*c5+31,N-1);c8++) 
transform Unroll(ufactor=8)
                for (c9=32*c6;c9<=min(N-1,32*c6+31);c9++) 
{
                  S2(c1-c2-c3,c3,c2,c4,c6,c5,c7,c9,c8) ;
}
) @*/{ 

  /* Main part: eight c8 values at a time. */
  for (c8 = max(c7 + 1, 32 * c5); c8 <= min(32 * c5 + 31, N - 1) - 7; c8 = c8 + 8)     { 

      /* Eight c9 values for each of the eight c8 values (64 updates). */
      for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8)         { 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), (c8 + 7)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), (c8 + 7)); 
        } 

      /* c9 remainder: leftover c9 values, still eight c8 values each. */
      for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1)         { 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 1)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 2)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 3)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 4)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 5)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 6)); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, (c8 + 7)); 
        } 
    } 

  /* c8 remainder: leftover c8 values one at a time, c9 still unrolled by 8
     with its own single-step remainder. */
  for (; c8 <= min(32 * c5 + 31, N - 1); c8 = c8 + 1)     { 

      for (c9 = 32 * c6; c9 <= min(N - 1, 32 * c6 + 31) - 7; c9 = c9 + 8)         { 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 1), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 2), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 3), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 4), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 5), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 6), c8); 
          S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, (c9 + 7), c8); 
        } 

      for (; c9 <= min(N - 1, 32 * c6 + 31); c9 = c9 + 1)         S2(c1 - c2 - c3, c3, c2, c4, c6, c5, c7, c9, c8); 
    } 
} 
/*@ end @*/
            }
            /* Trailing S1 for the pivot row 32*c4+31 of this tile. */
            if ((c1 == c2+2*c3) && (-c4 == -c6) && (c4 <= min(floord(N-33,32),floord(32*c5-1,32)))) {
              for (c8=max(32*c5,32*c4+32);c8<=min(N-1,32*c5+31);c8++) {
                if ((c1-c2)%2 == 0) {
                  S1((c1-c2)/2,c2,c4,c5,32*c4+31,c8) ;
                }
              }
            }
          }
        }
      }
    }
  }
}
/* End of CLooG code */



    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  
  /* Report the mean time per repetition. */
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  
  return ((int) A[0][0]); 

}
Exemplo n.º 10
0
/*
 * Benchmark driver for a tiled kernel over the global array A, with the
 * statement bodies written out inline and the innermost loops annotated for
 * vectorisation.
 *
 * Calls init_arrays() once, then runs the loop nest REPS times, timing each
 * run with rtclock(). Without TEST defined it prints the mean time; with TEST
 * defined it prints every element of the N x N array A instead. Returns
 * A[0][0] converted to int.
 */
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++)
    {
      annot_t_start = rtclock();



/* NOTE(review): these directives sit inside the body of main (inside the
   repetition loop). The macros below are textual and take effect from here
   on; the #include lines presumably rely on the headers having already been
   included at file scope -- confirm. */
#include <math.h>
#include <assert.h>
/* Floating-point ceiling/floor of n/d, and plain two-argument max/min.
   max/min evaluate their arguments more than once. */
#define ceild(n,d)  ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))



 int c1, c2, c3, c4, c5, c6, c7, c8, c9;
 /* Bounds of the innermost c9 loops, computed just before each loop. */
 register int lbv, ubv;

if (N >= 2) {
  for (c1=0;c1<=floord(N-2,256);c1++) {
    for (c2=max(ceild(128*c1-127,128),0);c2<=floord(N-1,256);c2++) {
      for (c3=max(ceild(128*c1-32385,32640),ceild(128*c1-127,128));c3<=floord(N-1,256);c3++) {
        for (c4=max(max(8*c1-1792*c3-1778,0),8*c1);c4<=min(min(min(min(floord(N-2,32),floord(128*c2+127,16)),8*c1+7),floord(128*c3+127,16)),floord(3968*c3+3937,16));c4++) {
          for (c5=max(max(0,ceild(16*c4-15,16)),8*c2);c5<=min(floord(N-1,32),8*c2+7);c5++) {
            for (c6=max(max(max(max(ceild(16*c4-465,496),ceild(-8*c1+8*c3+c4-217,225)),ceild(8*c1-8*c3-c4-217,223)),8*c3),ceild(16*c4-15,16));c6<=min(8*c3+7,floord(N-1,32));c6++) {
              /* Case c1 == c3 and c4 == c6: for each c7, first divide row c7
                 by its diagonal entry, then subtract from every row c8 > c7
                 inside the tile. */
              if ((c1 == c3) && (c4 == c6)) {
                for (c7=max(0,32*c6);c7<=min(min(32*c5+30,N-2),32*c6+30);c7++) {
{
 lbv=max(c7+1,32*c5); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
                    {A[c7][c9]=A[c7][c9]/A[c7][c7];} ;
                  }
}
                  for (c8=c7+1;c8<=min(N-1,32*c6+31);c8++) {
{
 lbv=max(c7+1,32*c5); ubv=min(32*c5+31,N-1);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
                      {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];} ;
                    }
}
                  }
                }
              }
     /* Update-only region: c7 and c8 are each unrolled by 4, with single-step
        remainder loops for both. Every c9 loop applies
        A[c8][c9] -= A[c8][c7]*A[c7][c9] for one c7 value; its lower bound
        moves with that c7 value (c7+1, c7+2, ...). */
     {
  for (c7 = max(0, 32 * c4); c7 <= min(min(32 * c6 - 1, 32 * c5 + 30), 32 * c4 + 31) - 3; c7 = c7 + 4) {
      /* Four rows c8..c8+3 per step, one c9 loop per c7 offset 0..3. */
      for (c8 = 32 * c6; c8 <= min(N - 1, 32 * c6 + 31) - 3; c8 = c8 + 4) {
{
 lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];};
              {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][c7]*A[c7][c9];};
              {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][c7]*A[c7][c9];};
              {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][c7]*A[c7][c9];};
            }
}
{
 lbv=max(32*c5,(c7+1)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 1)]*A[(c7 + 1)][c9];};
              {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 1)]*A[(c7 + 1)][c9];};
              {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 1)]*A[(c7 + 1)][c9];};
              {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 1)]*A[(c7 + 1)][c9];};
            }
}
{
 lbv=max(32*c5,(c7+2)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 2)]*A[(c7 + 2)][c9];};
              {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 2)]*A[(c7 + 2)][c9];};
              {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 2)]*A[(c7 + 2)][c9];};
              {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 2)]*A[(c7 + 2)][c9];};
            }
}
{
 lbv=max(32*c5,(c7+3)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 3)]*A[(c7 + 3)][c9];};
              {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][(c7 + 3)]*A[(c7 + 3)][c9];};
              {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][(c7 + 3)]*A[(c7 + 3)][c9];};
              {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][(c7 + 3)]*A[(c7 + 3)][c9];};
            }
}
        }
      /* c8 remainder for the unrolled c7 group: one row at a time, still
         four c7 offsets. */
      for (; c8 <= min(N - 1, 32 * c6 + 31); c8 = c8 + 1) {
{
 lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];};
            }
}
{
 lbv=max(32*c5,(c7+1)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 1)]*A[(c7 + 1)][c9];};
            }
}
{
 lbv=max(32*c5,(c7+2)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 2)]*A[(c7 + 2)][c9];};
            }
}
{
 lbv=max(32*c5,(c7+3)+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
              {A[c8][c9]=A[c8][c9]-A[c8][(c7 + 3)]*A[(c7 + 3)][c9];};
            }
}
        }
    }
  /* c7 remainder: leftover c7 values one at a time, c8 unrolled by 4 with
     its own single-step remainder. */
  for (; c7 <= min(min(32 * c6 - 1, 32 * c5 + 30), 32 * c4 + 31); c7 = c7 + 1) {
      for (c8 = 32 * c6; c8 <= min(N - 1, 32 * c6 + 31) - 3; c8 = c8 + 4)
{
 lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
            {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];};
            {A[(c8 + 1)][c9]=A[(c8 + 1)][c9]-A[(c8 + 1)][c7]*A[c7][c9];};
            {A[(c8 + 2)][c9]=A[(c8 + 2)][c9]-A[(c8 + 2)][c7]*A[c7][c9];};
            {A[(c8 + 3)][c9]=A[(c8 + 3)][c9]-A[(c8 + 3)][c7]*A[c7][c9];};
          }
}
      for (; c8 <= min(N - 1, 32 * c6 + 31); c8 = c8 + 1)
{
 lbv=max(32*c5,c7+1); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
            {A[c8][c9]=A[c8][c9]-A[c8][c7]*A[c7][c9];};
          }
}
    }
}

              /* Trailing division of row 32*c4+31 by its diagonal entry. */
              if ((c1 == c3) && (-c4 == -c6) && (c4 <= min(floord(N-33,32),floord(32*c5-1,32)))) {
{
 lbv=max(32*c4+32,32*c5); ubv=min(N-1,32*c5+31);
#pragma ivdep
#pragma vector always
 for (c9=lbv; c9<=ubv; c9++) {
                  {A[32*c4+31][c9]=A[32*c4+31][c9]/A[32*c4+31][32*c4+31];} ;
                }
}
              }
            }
          }
        }
      }
    }
  }
}




      annot_t_end = rtclock();
      annot_t_total += annot_t_end - annot_t_start;
    }

  /* Mean time per repetition. */
  annot_t_total = annot_t_total / REPS;

#ifndef TEST
  printf("%f\n", annot_t_total);
#else
  /* Dump A row by row; an extra newline is emitted before every 100th
     column (including column 0). */
  {
    int i, j;
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        if (j%100==0)
          printf("\n");
        printf("%f ",A[i][j]);
      }
      printf("\n");
    }
  }
#endif

  return ((int) A[0][0]);

}
Exemplo n.º 11
0
/*
 * Partitions the sparse matrix m into bsx-by-bsy blocks and records every
 * block that contains at least one stored entry.
 *
 * Reads m->numRows, m->numCols, m->numNZEntries, m->rowPtrs[] and
 * m->nzentries[].colNum. The scan relies on the entries of each row being
 * ordered by increasing column (presumably CSR with sorted columns -- confirm
 * against the matrix loader).
 *
 * Outputs (allocated here with malloc; the caller releases them):
 *   *rowIndices      ceild(nrows,bsx)+1 entries; entry r is the index of the
 *                    first block of row block r, the last entry is the total
 *                    block count.
 *   *indices         per block: its column-block index (column / bsy).
 *   *nnzCount_block  per block: number of stored entries inside the block.
 *   *yCount_block    per block: number of distinct columns of the block that
 *                    hold at least one entry.
 *   *numblocks       total number of non-empty blocks.
 *
 * Returns 0 on success and ERR_INSUFFICIENT_MEMORY when an allocation fails.
 * On failure every buffer allocated here is released and the four output
 * pointers are left NULL. A NULL result from a zero-sized allocation is not
 * treated as a failure.
 */
int inspectBlock(SpMatrix *m, unsigned int **rowIndices, unsigned int **indices,
                  unsigned int *numblocks, unsigned int **nnzCount_block, unsigned int **yCount_block,
                  unsigned int bsx, unsigned int bsy)
{
    unsigned int nblocks = 0;
    unsigned int nrows = m->numRows;
    unsigned int ncols = m->numCols;
    /* Block counts per dimension, computed once instead of in every loop test. */
    unsigned int nRowBlocks = (unsigned int)ceild(nrows,bsx);
    unsigned int nColBlocks = (unsigned int)ceild(ncols,bsy);
    unsigned int rowBlocksBuilt = 0;  /* how many indRows[] slots hold a malloc result */
    unsigned int it, iti;
    int ok = 0;

    /* Scratch buffers; all are released at the cleanup label. */
    unsigned int *yflag = (unsigned int *)malloc(sizeof(unsigned int)*bsy);
    unsigned int *yBFlag = (unsigned int *)malloc(sizeof(unsigned int)*nColBlocks);
    unsigned int *rowEnd = (unsigned int *)malloc(sizeof(unsigned int)*bsx);
    unsigned int *bptr = (unsigned int *)malloc(sizeof(unsigned int)*bsx);
    unsigned int *colFlag = (unsigned int *)malloc(sizeof(unsigned int)*nColBlocks);
    unsigned int *nblocksRow = (unsigned int *)malloc(sizeof(unsigned int)*nRowBlocks);
    unsigned int **indRows = (unsigned int **)malloc(sizeof(unsigned int *)*nRowBlocks);

    *rowIndices = NULL;
    *indices = NULL;
    *nnzCount_block = NULL;
    *yCount_block = NULL;

    if ((yflag == NULL && bsy > 0) ||
        ((rowEnd == NULL || bptr == NULL) && bsx > 0) ||
        ((yBFlag == NULL || colFlag == NULL) && nColBlocks > 0) ||
        ((nblocksRow == NULL || indRows == NULL) && nRowBlocks > 0))
        goto cleanup;

    *rowIndices = (unsigned int *)malloc(sizeof(unsigned int)*(nRowBlocks+1));
    if (*rowIndices == NULL) goto cleanup;

    // for each block of row
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        unsigned int rowStop = min(it+bsx,nrows);

        // start of a row block
        (*rowIndices)[iti] = nblocks;
        nblocksRow[iti] = 0;
        for (unsigned int i = it; i < rowStop; i++) {
            /* [bptr, rowEnd) is the not-yet-consumed part of row i. The end is
               kept exclusive so that an empty row starting at offset 0 cannot
               wrap an unsigned "last index" around to UINT_MAX. */
            bptr[i-it] = (m->rowPtrs)[i];
            if (i == (nrows-1)) rowEnd[i-it] = m->numNZEntries;
            else rowEnd[i-it] = (m->rowPtrs)[i+1];
        }

        // for each block of column within a row block
        for (unsigned int jt = 0, jti = 0; jt < ncols; jt += bsy, jti++) {
            colFlag[jti] = 0;
            yBFlag[jti] = 0;
            for (unsigned int k = 0; k < bsy; k++)
                yflag[k] = 0;
            unsigned int blockStart = nblocks;
            for (unsigned int i = it; i < rowStop; i++) {
                unsigned int j = bptr[i-it];
                for (; j < rowEnd[i-it]; j++) {
                    unsigned int cInd = (m->nzentries)[j].colNum;
                    if (cInd >= jt+bsy)
                        break;          /* entry belongs to a later column block */
                    yflag[cInd-jt] = 1; /* column of this block is occupied */
                    colFlag[jti]++;     /* colFlag[jti] stores the #nnzs in the block */
                    if (blockStart == nblocks) {
                        /* first entry seen in this block: count the block once */
                        nblocks++;
                        nblocksRow[iti]++;
                    }
                }
                bptr[i-it] = j;
            }
            for (unsigned int k = 0; k < bsy; k++)
                yBFlag[jti] += yflag[k];
        }

        /* Three values per non-empty block of this row block:
           column-block index, entry count, occupied-column count. */
        indRows[iti] = (unsigned int *)malloc(sizeof(unsigned int)*3*nblocksRow[iti]);
        rowBlocksBuilt = iti+1;
        if (indRows[iti] == NULL && nblocksRow[iti] > 0) goto cleanup;
        for (unsigned int k = 0, indRowk = 0; k < nColBlocks; k++) {
            if (colFlag[k]) {
                indRows[iti][3*indRowk] = k;
                indRows[iti][3*indRowk+1] = colFlag[k];
                indRows[iti][3*indRowk+2] = yBFlag[k];
                indRowk++;
            }
            // indRowk at the end of the for loop will be equal to nblocksRow[iti]
        }
    }
    (*rowIndices)[iti] = nblocks;

    *numblocks = nblocks;
    *indices = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    *nnzCount_block = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    *yCount_block = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    if (nblocks > 0 &&
        (*indices == NULL || *nnzCount_block == NULL || *yCount_block == NULL))
        goto cleanup;

    // Merge all indRows to generate indices, nnzCount_block and yCount_block
    nblocks = 0;
    for (unsigned int k = 0; k < nRowBlocks; k++) {
        for (unsigned int l = 0; l < nblocksRow[k]; l++) {
            (*indices)[nblocks] = indRows[k][3*l];
            (*nnzCount_block)[nblocks] = indRows[k][3*l+1];
            (*yCount_block)[nblocks] = indRows[k][3*l+2];
            nblocks++;
        }
    }
    ok = 1;

cleanup:
    for (unsigned int k = 0; k < rowBlocksBuilt; k++)
        free(indRows[k]);
    free(nblocksRow);
    free(indRows);
    free(rowEnd);
    free(bptr);
    free(colFlag);
    free(yflag);
    free(yBFlag);
    if (!ok) {
        /* Do not hand partially built results back to the caller. */
        free(*rowIndices);     *rowIndices = NULL;
        free(*indices);        *indices = NULL;
        free(*nnzCount_block); *nnzCount_block = NULL;
        free(*yCount_block);   *yCount_block = NULL;
        return ERR_INSUFFICIENT_MEMORY;
    }
    return 0;
}
Exemplo n.º 12
0
/*
 * Helper for inspectInputBlock: loads the entry range of rows [it, rowStop)
 * into bptr[] (next unread entry) and rowEnd[] (one past the last entry) and
 * returns the largest number of entries found in any of those rows.
 * The end is exclusive so that an empty row starting at offset 0 cannot wrap
 * an unsigned "last index" around to UINT_MAX.
 */
static unsigned int inspectInputBlock_rowBounds(SpMatrix *m, unsigned int it, unsigned int rowStop,
                                                unsigned int *bptr, unsigned int *rowEnd)
{
    unsigned int nrows = m->numRows;
    unsigned int widest = 0;
    for (unsigned int i = it; i < rowStop; i++) {
        bptr[i-it] = (m->rowPtrs)[i];
        if (i == (nrows-1)) rowEnd[i-it] = m->numNZEntries;
        else rowEnd[i-it] = (m->rowPtrs)[i+1];
        widest = max(widest, rowEnd[i-it]-bptr[i-it]);
    }
    return widest;
}

/*
 * Helper for inspectInputBlock: consumes up to bsy entries from each row of
 * [it, rowStop), starting at bptr[], and stores the column indices met into
 * out[]. Columns of the first row are appended as they come; columns of the
 * other rows are appended only when isPresent() does not find them in out[].
 * Advances bptr[] past the consumed entries and returns the number of
 * columns stored.
 */
static unsigned int inspectInputBlock_collect(SpMatrix *m, unsigned int it, unsigned int rowStop,
                                              unsigned int bsy, unsigned int *bptr,
                                              unsigned int *rowEnd, unsigned int *out)
{
    unsigned int count = 0;
    for (unsigned int i = it; i < rowStop; i++) {
        unsigned int j = bptr[i-it];
        unsigned int stop = min(j+bsy, rowEnd[i-it]);
        for (; j < stop; j++) {
            unsigned int cInd = (m->nzentries)[j].colNum;
            if (i == it || !isPresent(out,count,cInd))
                out[count++] = cInd;
        }
        bptr[i-it] = j;
    }
    return count;
}

/*
 * Builds, for every block of the matrix, the list of input (column) indices
 * the block touches. Rows are grouped bsx at a time; within a row group,
 * block b covers entries [b*bsy, (b+1)*bsy) of each row, so a row group has
 * ceild(maxNZPerRowBlock,bsy) blocks.
 *
 * Outputs (allocated here with malloc; the caller releases them):
 *   *rowIndices      ceild(nrows,bsx)+1 entries; first block of each row
 *                    group, last entry = total block count.
 *   *numblocks       total block count.
 *   *indices         numblocks+1 entries; offset of each block's list inside
 *                    *inputList, last entry = total list length.
 *   *inputList       concatenated per-block lists. A block's sorted column
 *                    list is stored only when it holds more than bsx*bsy/2
 *                    columns spanning fewer than 512; otherwise every slot of
 *                    that block is filled with ncols.
 *   *inputListCount  total length of *inputList.
 *
 * Always returns 0.
 * NOTE(review): allocation results are not checked in this function, unlike
 * in inspectBlock/inspectVarBlock -- consider returning the same error code.
 */
int inspectInputBlock(SpMatrix *m, unsigned int **inputList, unsigned int **rowIndices, unsigned int **indices,
                  unsigned int *numblocks, unsigned int *inputListCount, unsigned int bsx, unsigned int bsy)
{
    unsigned int nblocks = 0, nblocksPerRowBlock;
    unsigned int nrows = m->numRows;
    unsigned int ncols = m->numCols;
    unsigned int *rowEnd = (unsigned int *)malloc(sizeof(int)*bsx);
    unsigned int *bptr = (unsigned int *)malloc(sizeof(int)*bsx);
    /* A block contributes at most bsy columns from each of its bsx rows. */
    unsigned int *inpListRowBlock = (unsigned int *)malloc(sizeof(int)*bsx*bsy);

    *rowIndices = (unsigned int *)malloc(sizeof(int)*((int)ceild(nrows,bsx)+1));

    unsigned int it, iti;
    unsigned int maxNZPerRowBlock;

    /* Pass 1: count the blocks of every row group. */
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        (*rowIndices)[iti] = nblocks;
        maxNZPerRowBlock = inspectInputBlock_rowBounds(m, it, min(it+bsx,nrows), bptr, rowEnd);
        nblocks += (int) ceild(maxNZPerRowBlock,bsy);
    }
    (*rowIndices)[iti] = nblocks;
    *numblocks = nblocks;
    *indices = (unsigned int *)malloc(sizeof(int)*(nblocks+1));

    /* Pass 2: size of every block's list, giving the offsets in *indices. */
    unsigned int countPerBlock, countInputList = 0;
    nblocks = 0;
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        unsigned int rowStop = min(it+bsx,nrows);
        maxNZPerRowBlock = inspectInputBlock_rowBounds(m, it, rowStop, bptr, rowEnd);
        nblocksPerRowBlock = (int) ceild(maxNZPerRowBlock,bsy);
        /* Iterate once per block. The counter must advance by one: stepping
           it by bsy against a block count would visit too few blocks and
           leave *indices out of step with *rowIndices. */
        for (unsigned int jti = 0; jti < nblocksPerRowBlock; jti++, nblocks++) {
            (*indices)[nblocks] = countInputList;
            countInputList += inspectInputBlock_collect(m, it, rowStop, bsy, bptr, rowEnd, inpListRowBlock);
        }
    }
    (*indices)[nblocks] = countInputList;

    /* Pass 3: same traversal again, now writing the lists. */
    *inputList = (unsigned int *)malloc(sizeof(int)*countInputList);
    *inputListCount = countInputList;
    countInputList = 0;
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        unsigned int rowStop = min(it+bsx,nrows);
        maxNZPerRowBlock = inspectInputBlock_rowBounds(m, it, rowStop, bptr, rowEnd);
        nblocksPerRowBlock = (int) ceild(maxNZPerRowBlock,bsy);
        for (unsigned int jti = 0; jti < nblocksPerRowBlock; jti++) {
            countPerBlock = inspectInputBlock_collect(m, it, rowStop, bsy, bptr, rowEnd, inpListRowBlock);
            //Sort inpListRowBlock
            sort(inpListRowBlock,countPerBlock);
            if ( (countPerBlock > (bsx*bsy/2)) && (inpListRowBlock[countPerBlock-1] - inpListRowBlock[0] <512) ) {
                /* Dense, narrow block: keep its actual column list. */
                for (unsigned int k = 0; k < countPerBlock; k++)
                    (*inputList)[countInputList+k] = inpListRowBlock[k];
            }
            else {
                /* Otherwise mark every slot with ncols (not a valid column). */
                for (unsigned int k = 0; k < countPerBlock; k++)
                    (*inputList)[countInputList+k] = ncols;
            }
            countInputList += countPerBlock;
        }
    }

    free(inpListRowBlock);
    free(rowEnd);
    free(bptr);

    return 0;
}
Exemplo n.º 13
0
/*
 * Blocks the sparse matrix m into bsx-row by bsy-column blocks whose column
 * start is variable: before each block the start column is moved forward, in
 * steps of varC, to the first varC-wide strip in which any row of the row
 * group still has an entry. It then produces a padded copy of the entries in
 * which row segments are filled up to a multiple of HALFWARP with zero values.
 *
 * Reads m->numRows, m->numCols, m->numNZEntries, m->rowPtrs[],
 * m->nzentries[].colNum and m->nzentries[].val. The scan relies on each row's
 * entries being ordered by increasing column, and on bsy being a multiple of
 * varC (assumption stated by the original author).
 *
 * Outputs (allocated here with malloc; the caller releases them):
 *   *rowIndices      ceild(nrows,bsx)+1 entries; first block of each row
 *                    group, last entry = total block count.
 *   *indices         per block: its start column.
 *   *nnzCount_block  per block: number of stored entries.
 *   *yCount_block    per block: number of occupied columns.
 *   *numblocks       total block count.
 *   *valFill         padded values, *indicesFill the matching column indices
 *                    (padding slots carry value 0 and the block start column).
 *   *rowIndicesFill  nrows+1 offsets of each row inside the padded arrays.
 *   *nnz_fill        number of padded entries actually written.
 *
 * Returns 0 on success and ERR_INSUFFICIENT_MEMORY when an allocation fails.
 * On failure every buffer allocated here is released and the output pointers
 * are left NULL. A NULL result from a zero-sized allocation is not treated as
 * a failure.
 */
int inspectVarBlock(SpMatrix *m, float **valFill, unsigned int **indicesFill, unsigned int **rowIndicesFill, unsigned int **rowIndices, 
		  unsigned int **indices, unsigned int *numblocks, unsigned int **nnzCount_block, unsigned int **yCount_block,
                  unsigned int *nnz_fill, unsigned int bsx, unsigned int bsy, unsigned int varC)
{
    unsigned int nblocks = 0, nnz_filled = 0;
    unsigned int nrows = m->numRows;
    unsigned int ncols = m->numCols;
    /* Block counts per dimension, computed once instead of in every loop test. */
    unsigned int nRowBlocks = (unsigned int)ceild(nrows,bsx);
    unsigned int nColBlocks = (unsigned int)ceild(ncols,bsy);
    unsigned int rowBlocksBuilt = 0;  /* how many indRows[] slots hold a malloc result */
    unsigned int it, iti;
    int ok = 0;

    /* Scratch buffers; all are released at the cleanup label. */
    unsigned int *yflag = (unsigned int *)malloc(sizeof(unsigned int)*bsy);
    unsigned int *yBFlag = (unsigned int *)malloc(sizeof(unsigned int)*nColBlocks);
    unsigned int *rowEnd = (unsigned int *)malloc(sizeof(unsigned int)*bsx);
    unsigned int *bptr = (unsigned int *)malloc(sizeof(unsigned int)*bsx);
    unsigned int *colFlag = (unsigned int *)malloc(sizeof(unsigned int)*nColBlocks);
    unsigned int *startCol = (unsigned int *)malloc(sizeof(unsigned int)*nColBlocks);
    unsigned int *nblocksRow = (unsigned int *)malloc(sizeof(unsigned int)*nRowBlocks);
    unsigned int **indRows = (unsigned int **)malloc(sizeof(unsigned int *)*nRowBlocks);

    *valFill = NULL;
    *indicesFill = NULL;
    *rowIndicesFill = NULL;
    *rowIndices = NULL;
    *indices = NULL;
    *nnzCount_block = NULL;
    *yCount_block = NULL;

    if ((yflag == NULL && bsy > 0) ||
        ((rowEnd == NULL || bptr == NULL) && bsx > 0) ||
        ((yBFlag == NULL || colFlag == NULL || startCol == NULL) && nColBlocks > 0) ||
        ((nblocksRow == NULL || indRows == NULL) && nRowBlocks > 0))
        goto cleanup;

    *rowIndices = (unsigned int *)malloc(sizeof(unsigned int)*(nRowBlocks+1));
    if (*rowIndices == NULL) goto cleanup;

    // Assume bsy is a multiple of varC
    // Do blocking as if block size along column is varC
    // Then combine blocks of size bsy

    // for each block of row
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        unsigned int rowStop = min(it+bsx,nrows);

        // start of a row block
        (*rowIndices)[iti] = nblocks;
        nblocksRow[iti] = 0;
        for (unsigned int i = it; i < rowStop; i++) {
            /* [bptr, rowEnd) is the not-yet-consumed part of row i; the end is
               exclusive so an empty row at offset 0 cannot wrap to UINT_MAX. */
            bptr[i-it] = (m->rowPtrs)[i];
            if (i == (nrows-1)) rowEnd[i-it] = m->numNZEntries;
            else rowEnd[i-it] = (m->rowPtrs)[i+1];
        }

        // for each block of column within a row block
        for (unsigned int jt = 0, jti = 0; jt < ncols; jt += bsy, jti++) {

            // Skip zero column blocks (of size varC): find the smallest strip
            // start jt + k*varC that contains the next entry of some row.
            // Rows with no entries left are ignored -- looking at
            // nzentries[bptr] for them would read another row's entry (or
            // past the end of the array for the last row).
            unsigned int minjt = jt;
            int found = 0;
            for (unsigned int i = it; i < rowStop; i++) {
                if (bptr[i-it] >= rowEnd[i-it])
                    continue;
                unsigned int cInd = (m->nzentries)[bptr[i-it]].colNum;
                unsigned int jtv = jt;
                while (cInd >= jtv+varC)
                    jtv += varC;
                if (!found || jtv < minjt) {
                    minjt = jtv;
                    found = 1;
                }
            }
            if (!found)
                break;  /* every row of this row group is exhausted */
            jt = minjt;
            startCol[jti] = jt;
            colFlag[jti] = 0;
            yBFlag[jti] = 0;
            for (unsigned int k = 0; k < bsy; k++)
                yflag[k] = 0;
            unsigned int blockStart = nblocks;

            for (unsigned int i = it; i < rowStop; i++) {
                unsigned int j = bptr[i-it];
                unsigned int rownnz = 0;
                for (; j < rowEnd[i-it]; j++) {
                    unsigned int cInd = (m->nzentries)[j].colNum;
                    if (cInd >= jt+bsy)
                        break;          /* entry belongs to a later block */
                    yflag[cInd-jt] = 1; /* column of this block is occupied */
                    colFlag[jti]++;     /* colFlag[jti] stores the #nnzs in the block */
                    nnz_filled++;
                    rownnz++;
                    if (blockStart == nblocks) {
                        /* first entry seen in this block: count the block once */
                        nblocks++;
                        nblocksRow[iti]++;
                    }
                }
                bptr[i-it] = j;
                /* Reserve padding so this row's share of the block is a
                   multiple of HALFWARP (both when the row runs into the next
                   block and when it ends here). */
                if (rownnz%HALFWARP) nnz_filled += (HALFWARP - (rownnz%HALFWARP));
            }
            for (unsigned int k = 0; k < bsy; k++)
                yBFlag[jti] += yflag[k];
        }

        /* Three values per block of this row group:
           start column, entry count, occupied-column count. */
        indRows[iti] = (unsigned int *)malloc(sizeof(unsigned int)*3*nblocksRow[iti]);
        rowBlocksBuilt = iti+1;
        if (indRows[iti] == NULL && nblocksRow[iti] > 0) goto cleanup;
        for (unsigned int indRowk = 0; indRowk < nblocksRow[iti]; indRowk++) {
            indRows[iti][3*indRowk] = startCol[indRowk];
            indRows[iti][3*indRowk+1] = colFlag[indRowk];
            indRows[iti][3*indRowk+2] = yBFlag[indRowk];
        }
    }
    (*rowIndices)[iti] = nblocks;

    *numblocks = nblocks;
    *indices = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    *nnzCount_block = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    *yCount_block = (unsigned int *)malloc(sizeof(unsigned int)*nblocks);
    if (nblocks > 0 &&
        (*indices == NULL || *nnzCount_block == NULL || *yCount_block == NULL))
        goto cleanup;

    // Merge all indRows to generate indices, nnzCount_block and yCount_block
    nblocks = 0;
    for (unsigned int k = 0; k < nRowBlocks; k++) {
        for (unsigned int l = 0; l < nblocksRow[k]; l++) {
            (*indices)[nblocks] = indRows[k][3*l];
            (*nnzCount_block)[nblocks] = indRows[k][3*l+1];
            (*yCount_block)[nblocks] = indRows[k][3*l+2];
            nblocks++;
        }
    }

    // Fill in value
    *valFill = (float *)malloc(sizeof(float)*(nnz_filled));
    *indicesFill = (unsigned int *)malloc(sizeof(unsigned int)*(nnz_filled));
    *rowIndicesFill = (unsigned int *)malloc(sizeof(unsigned int)*(nrows+1));
    if (*rowIndicesFill == NULL ||
        (nnz_filled > 0 && (*valFill == NULL || *indicesFill == NULL)))
        goto cleanup;

    // One more loop to fill in val
    // NOTE(review): when a row runs out of entries inside a block that is not
    // the last block of its row group, the counting pass above reserves
    // padding for it but this pass writes none (rownnz is reset for the next
    // block first). The arrays are therefore never overrun, but confirm
    // whether the consumer expects that padding.
    nnz_filled = 0;
    for (it = 0, iti = 0; it < nrows; it += bsx, iti++) {
        /* Blocks of this row group are [blockBegin, blockEnd); the end is
           exclusive so an empty leading row group cannot wrap to UINT_MAX. */
        unsigned int blockBegin = (*rowIndices)[iti];
        unsigned int blockEnd = (*rowIndices)[iti+1];
        unsigned int rowStop = min(it+bsx,nrows);
        for (unsigned int i = it; i < rowStop; i++) {
            (*rowIndicesFill)[i] = nnz_filled;
            unsigned int j = (m->rowPtrs)[i];
            unsigned int jEnd;
            if (i == (nrows-1)) jEnd = m->numNZEntries;
            else jEnd = (m->rowPtrs)[i+1];
            unsigned int rownnz = 0;
            unsigned int jti = 0;   /* start column of the block being filled */
            for (unsigned int jb = blockBegin; jb < blockEnd; jb++) {
                jti = (*indices)[jb];
                rownnz = 0;
                for (; j < jEnd; j++) {
                    unsigned int cInd = (m->nzentries)[j].colNum;
                    if (cInd >= (jti+bsy)) {
                        /* Row continues in a later block: pad this segment. */
                        if (rownnz%HALFWARP) {
                            for (unsigned int p = 0; p < (HALFWARP - (rownnz%HALFWARP)); p++) {
                                (*valFill)[nnz_filled] = 0;
                                (*indicesFill)[nnz_filled] = jti;
                                nnz_filled++;
                            }
                        }
                        break;
                    }
                    else {
                        (*valFill)[nnz_filled] = (m->nzentries)[j].val;
                        (*indicesFill)[nnz_filled] = cInd;
                        nnz_filled++;
                        rownnz++;
                    }
                }
            }
            /* Pad the segment of the last block visited. */
            if (rownnz%HALFWARP) {
                for (unsigned int p = 0; p < (HALFWARP - (rownnz%HALFWARP)); p++) {
                    (*valFill)[nnz_filled] = 0;
                    (*indicesFill)[nnz_filled] = jti;
                    nnz_filled++;
                }
            }
        }
    }
    (*rowIndicesFill)[nrows] = nnz_filled;
    *nnz_fill = nnz_filled;
    ok = 1;

cleanup:
    for (unsigned int k = 0; k < rowBlocksBuilt; k++)
        free(indRows[k]);
    free(nblocksRow);
    free(indRows);
    free(rowEnd);
    free(bptr);
    free(colFlag);
    free(startCol);
    free(yflag);
    free(yBFlag);
    if (!ok) {
        /* Do not hand partially built results back to the caller. */
        free(*valFill);        *valFill = NULL;
        free(*indicesFill);    *indicesFill = NULL;
        free(*rowIndicesFill); *rowIndicesFill = NULL;
        free(*rowIndices);     *rowIndices = NULL;
        free(*indices);        *indices = NULL;
        free(*nnzCount_block); *nnzCount_block = NULL;
        free(*yCount_block);   *yCount_block = NULL;
        return ERR_INSUFFICIENT_MEMORY;
    }
    return 0;

}
Exemplo n.º 14
0
/*
 * Runs the tiled two-statement loop nest below and returns the time spent
 * in it.
 *
 * N is read from args.width and T from args.iters.  jbi_in and jbi_out are
 * aliased to the locals a and b; the loop nest never names them directly,
 * so presumably the S1/S2 statement macros (defined elsewhere) use a and b
 * -- TODO confirm against their definitions.
 *
 * The nest is bracketed by clock_gettime(CLOCK_MONOTONIC, ...) calls that
 * fill tbegin and tend (both declared outside this function); the return
 * value is ELAPSED_TIME_S(tend, tbegin).
 */
double djbi1d_from_pluto(struct args_dimt args, double *jbi_in,
  double *jbi_out) {

  int N,T;
  int c1, c2, c3, c4, c5;
  int i, j, k, l, t;
  /* presumably these casts silence unused-variable warnings -- TODO confirm */
  (void)i;
  (void)j;
  (void)k;
  (void)l;
  (void)t;
  register int lb, ub;

  N = args.width;
  T = args.iters;

  double *a, *b;
  a = jbi_in;
  b = jbi_out;

  clock_gettime(CLOCK_MONOTONIC, &tbegin);
  /* Generated from jacobi-imper.sched.cloog by CLooG v0.14.1 64 bits in 0.01s. */
  /* Outer loop over c1 runs sequentially; the c2 range [lb, ub] is computed per c1. */
 for (c1=-1;c1<=floord(N+3*T-4,2048);c1++) {
     lb = max(max(ceild(2048*c1-T+1,2048),ceild(4096*c1-2045,6144)),0);
     ub = min(min(floord(2048*c1+2047,2048),floord(4096*c1+N+4093,6144)),floord(N+2*T-3,2048));

   /* c2 iterations are shared out by OpenMP; c2..c5 and i..l are private per thread. */
#pragma omp parallel for shared(c1,lb,ub,a,b) private(c2,c3,c4,c5,i,j,k,l)
   for (c2=lb;c2<=ub;c2++) {
     /* Every S1/S2 call below is guarded by (c1-c2)%2 == 0, so (c1-c2)/2 is exact. */
     /* Case c3 = 4096*(c1-c2): S1 only (c5 = 0). */
     if (c1 >= max(c2,ceild(6144*c2-N+2,4096))) {
       c3 = 4096*c1-4096*c2 ;
        for (c4=max(4096*c1-4096*c2+2,2048*c2);c4<=min(4096*c1-4096*c2+N-2,2048*c2+2047);c4++) {
          c5 = 0 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            k = 2048*c1-2048*c2 ;
            l = -4096*c1+4096*c2+c4 ;
            S1((c1-c2)/2,-c1+2*c2,2048*c1-2048*c2,-4096*c1+4096*c2+c4) ;
          }
        }
      }
      /* Case c3 = 0: S1 only, third argument fixed to 0. */
      if ((c1 <= floord(4096*c2-1,4096)) && (c2 <= floord(N-2,2048))) {
        c3 = 0 ;
        for (c4=max(2048*c2,2);c4<=min(2048*c2+2047,N-2);c4++) {
          c5 = 0 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            k = 0 ;
            l = c4 ;
            S1((c1-c2)/2,-c1+2*c2,0,c4) ;
          }
        }
      }
      /* General c3 range: even c3 triggers S1 (c5 = 0), odd c3 triggers S2 (c5 = 1). */
      for (c3=max(max(1,2048*c2-N+2),4096*c1-4096*c2+1);c3<=min(min(4096*c1-4096*c2+4094,2048*c2+2045),2*T-2);c3++) {
        for (c4=max(2048*c2,c3+2);c4<=min(c3+N-2,2048*c2+2047);c4++) {
          c5 = 0 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            if (c3%2 == 0) {
              k = c3/2 ;
              l = -c3+c4 ;
              S1((c1-c2)/2,-c1+2*c2,c3/2,-c3+c4) ;
            }
          }
          c5 = 1 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            if ((c3-1)%2 == 0) {
              k = (c3-1)/2 ;
              l = -c3+c4 ;
              S2((c1-c2)/2,-c1+2*c2,(c3-1)/2,-c3+c4) ;
            }
          }
        }
      }
      /* Case c3 = 4096*(c1-c2)+4095: S2 only (c5 = 1). */
      if (c1 <= min(floord(3072*c2-1025,2048),floord(2048*c2+T-2048,2048))) {
        c3 = 4096*c1-4096*c2+4095 ;
        for (c4=max(4096*c1-4096*c2+4097,2048*c2);c4<=min(4096*c1-4096*c2+N+4093,2048*c2+2047);c4++) {
          c5 = 1 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            k = 2048*c1-2048*c2+2047 ;
            l = -4096*c1+4096*c2+c4-4095 ;
            S2((c1-c2)/2,-c1+2*c2,2048*c1-2048*c2+2047,-4096*c1+4096*c2+c4-4095) ;
          }
        }
      }
      /* Case c3 = 2*T-1: S2 only, third argument fixed to T-1. */
      if ((c1 >= ceild(4096*c2+2*T-4095,4096)) && (c2 >= ceild(T-1023,1024))) {
        c3 = 2*T-1 ;
        for (c4=max(2*T+1,2048*c2);c4<=min(2048*c2+2047,N+2*T-3);c4++) {
          c5 = 1 ;
          if ((c1-c2)%2 == 0) {
            i = (c1-c2)/2 ;
            j = -c1+2*c2 ;
            k = T-1 ;
            l = c4-2*T+1 ;
            S2((c1-c2)/2,-c1+2*c2,T-1,c4-2*T+1) ;
          }
        }
      }
    }
  }
  clock_gettime(CLOCK_MONOTONIC, &tend);

  return ELAPSED_TIME_S(tend, tbegin);
}
Exemplo n.º 15
0
/* Generated from ../../../git/cloog/test/classen2.cloog by CLooG 0.14.0-271-gaa1e292 gmp bits in 0.14s. */
/*
 * Fragment with no enclosing function: every identifier used here
 * (M, N, the outer*Scatter values, the compScatter* counters, S1, and the
 * max/min/ceild/floord helpers) must be supplied by the surrounding context.
 *
 * The guard below admits the scan only when M, N and the three outer tile
 * coordinates satisfy all of the listed inequalities.  The recurring factor
 * 5 looks like a tile size -- TODO confirm against classen2.cloog.
 */
if ((M >= 2) && (N >= 3) && (outerProcTileScatter1 >= outerProcTileScatter2) && (5*outerProcTileScatter1 <= M+2*N-4) && (5*outerProcTileScatter1 <= 5*outerProcTileScatter2+N+2) && (outerProcTileScatter2 >= 0) && (5*outerProcTileScatter2 <= M+N-2) && (outerTimeTileScatter >= outerProcTileScatter1) && (outerTimeTileScatter <= 2*outerProcTileScatter1) && (outerTimeTileScatter <= outerProcTileScatter1+outerProcTileScatter2+1) && (5*outerTimeTileScatter <= 2*M+2*N-6) && (5*outerTimeTileScatter <= 5*outerProcTileScatter1+M+2) && (5*outerTimeTileScatter >= 10*outerProcTileScatter1-2*N-2) && (5*outerTimeTileScatter <= 5*outerProcTileScatter2+M+N) && (5*outerTimeTileScatter >= 10*outerProcTileScatter2-N-3) && (5*outerTimeTileScatter <= 10*outerProcTileScatter2+N+3) && (5*outerTimeTileScatter >= 5*outerProcTileScatter1+5*outerProcTileScatter2-N-4)) {
  /* Three nested scans; each inner bound depends on the enclosing counters. */
  for (compScatter1=max(max(max(max(max(4,5*outerTimeTileScatter),5*outerProcTileScatter2+1),5*outerProcTileScatter1+5*outerProcTileScatter2-N),10*outerProcTileScatter1-2*N+2),10*outerProcTileScatter2-N+1);compScatter1<=min(min(min(min(min(5*outerTimeTileScatter+4,2*M+2*N-6),5*outerProcTileScatter1+M+2),5*outerProcTileScatter1+5*outerProcTileScatter2+5),5*outerProcTileScatter2+M+N),10*outerProcTileScatter2+N+3);compScatter1++) {
    for (compScatter2=max(max(max(max(ceild(compScatter1+4,2),5*outerProcTileScatter1),5*outerProcTileScatter2+1),compScatter1-M+2),compScatter1-5*outerProcTileScatter2-1);compScatter2<=min(min(min(min(floord(compScatter1+2*N-2,2),compScatter1),5*outerProcTileScatter1+4),compScatter1-5*outerProcTileScatter2+N),5*outerProcTileScatter2+N+2);compScatter2++) {
      for (compScatter3=max(max(5*outerProcTileScatter2,compScatter1-compScatter2+3),compScatter2-N+2);compScatter3<=min(min(compScatter2-1,5*outerProcTileScatter2+4),compScatter1-compScatter2+N);compScatter3++) {
        /* S1 receives three values derived from the counters, then the three counters themselves. */
        S1(compScatter1-compScatter2+1,-compScatter1+compScatter2+compScatter3-2,compScatter2-compScatter3,compScatter1,compScatter2,compScatter3);
      }
    }
  }
}
Exemplo n.º 16
0
/*
 * Enumerates points (t1, t2, t3) for the size parameter n and, at each
 * admitted point, invokes S1(i, j, k) or S2(i, j, t3).  S1, S2 and the
 * helpers min, max, ceild and floord are defined outside this function.
 *
 * The range of t1 is cut into consecutive pieces, each with its own loop
 * nest.  Inside every nest the parity tests (e.g. (t1+t2)%2 == 0) guarantee
 * that the halved expressions passed to S1/S2 divide exactly.  The layout
 * matches the CLooG-generated code elsewhere in this file; the order of the
 * pieces is presumably significant -- TODO confirm before reordering.
 */
void test(int n)
{
  /* Scattering iterators. */
  int t1, t2, t3;
  /* Original iterators. */
  int i, j, k;
  /* Slice t1 = -n+1, t2 = n+1: S1(1, n, k) for t3 in [n+3, 3n+1]. */
  if (n >= 1) {
    t1 = -n+1 ;
    t2 = n+1 ;
    for (t3=n+3;t3<=3*n+1;t3++) {
      if ((t3+n+1)%2 == 0) {
        k = (t3-n-1)/2 ;
        S1(1,n,(t3-n-1)/2) ;
      }
    }
  }
  /* Slice t1 = -n+2, taken only when n == 2 (the two tests pin n to 2). */
  if ((n >= 2) && (n <= 2)) {
    t1 = -n+2 ;
    for (t2=-n+4;t2<=3*n-2;t2++) {
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t2+n)%2 == 0) {
          i = (t2-n+2)/2 ;
          j = (t2+n-2)/2 ;
          if ((t3+n)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    t2 = n+3 ;
    for (t3=1;t3<=n;t3++) {
      S2(1,n,t3) ;
    }
  }
  /* Slice t1 = -n+2 for n >= 3. */
  if (n >= 3) {
    t1 = -n+2 ;
    for (t2=n;t2<=n+2;t2++) {
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t2+n)%2 == 0) {
          i = (t2-n+2)/2 ;
          j = (t2+n-2)/2 ;
          if ((t3+n)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t2-n+2)/2,(t2+n-2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    t2 = n+3 ;
    for (t3=1;t3<=n;t3++) {
      S2(1,n,t3) ;
    }
  }
  /* Negative t1 from ceild(-2n+5, 2) up to min(-n+6, -1). */
  for (t1=ceild(-2*n+5,2);t1<=min(-n+6,-1);t1++) {
    for (t2=-t1+2;t2<=-t1+4;t2++) {
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+5;t2<=t1+2*n;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    t2 = t1+2*n+1 ;
    for (t3=1;t3<=n;t3++) {
      i = t1+n-1 ;
      S2(t1+n-1,n,t3) ;
    }
  }
  /* Extra points emitted only when n == 2; all bounds here are constants. */
  if (n == 2) {
    for (t3=5;t3<=7;t3++) {
      if ((t3+1)%2 == 0) {
        k = (t3-3)/2 ;
        S1(2,1,(t3-3)/2) ;
      }
    }
    for (t2=4;t2<=6;t2++) {
      for (t3=1;t3<=2;t3++) {
        if (t2%2 == 0) {
          i = (t2-2)/2 ;
          j = (t2-2)/2 ;
          S2((t2-2)/2,(t2-2)/2,t3) ;
        }
      }
    }
  }
  /* Negative t1 from -n+7 to -1; the t3 range is split at t2+1 and n so S1 and S2 interleave. */
  for (t1=-n+7;t1<=-1;t1++) {
    for (t2=-t1+2;t2<=-t1+4;t2++) {
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+5;t2<=n-2;t2++) {
      for (t3=1;t3<=t2+1;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
      for (t3=n+1;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=n-1;t2<=t1+2*n;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    t2 = t1+2*n+1 ;
    for (t3=1;t3<=n;t3++) {
      i = t1+n-1 ;
      S2(t1+n-1,n,t3) ;
    }
  }
  /* t1 in [0, min(1, -n+6)], only for n >= 3. */
  if (n >= 3) {
    for (t1=0;t1<=min(1,-n+6);t1++) {
      for (t2=t1+2;t2<=-t1+4;t2++) {
        for (t3=t2+2;t3<=t2+2*n;t3++) {
          if ((t1+t2)%2 == 0) {
            i = (t1+t2)/2 ;
            j = (-t1+t2)/2 ;
            if ((t1+t3)%2 == 0) {
              k = (-t2+t3)/2 ;
              S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
            }
          }
        }
      }
      for (t2=-t1+5;t2<=-t1+2*n;t2++) {
        for (t3=1;t3<=n;t3++) {
          if ((t1+t2+1)%2 == 0) {
            i = (t1+t2-3)/2 ;
            j = (-t1+t2-1)/2 ;
            S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
          }
        }
        for (t3=t2+2;t3<=t2+2*n;t3++) {
          if ((t1+t2)%2 == 0) {
            i = (t1+t2)/2 ;
            j = (-t1+t2)/2 ;
            if ((t1+t3)%2 == 0) {
              k = (-t2+t3)/2 ;
              S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
            }
          }
        }
      }
      for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) {
        for (t3=1;t3<=n;t3++) {
          if ((t1+t2+1)%2 == 0) {
            i = (t1+t2-3)/2 ;
            j = (-t1+t2-1)/2 ;
            S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
          }
        }
      }
    }
  }
  /* t1 in [max(-n+7, 0), 1]. */
  for (t1=max(-n+7,0);t1<=1;t1++) {
    for (t2=t1+2;t2<=-t1+4;t2++) {
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+5;t2<=n-2;t2++) {
      for (t3=1;t3<=t2+1;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
      for (t3=n+1;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=n-1;t2<=-t1+2*n;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+2*n+1;t2<=t1+2*n+1;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
    }
  }
  /* t1 in [2, n-5]; the first t2 value (t1+2) is peeled and yields only S1(t1+1, 1, k). */
  for (t1=2;t1<=n-5;t1++) {
    t2 = t1+2 ;
    for (t3=t1+4;t3<=t1+2*n+2;t3++) {
      i = t1+1 ;
      if ((t1+t3)%2 == 0) {
        k = (-t1+t3-2)/2 ;
        S1(t1+1,1,(-t1+t3-2)/2) ;
      }
    }
    for (t2=t1+3;t2<=n-2;t2++) {
      for (t3=1;t3<=t2+1;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
      for (t3=n+1;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=n-1;t2<=-t1+2*n;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
    }
  }
  /* t1 in [max(2, n-4), floord(2n-3, 2)]; same peeled first t2 as the previous piece. */
  for (t1=max(2,n-4);t1<=floord(2*n-3,2);t1++) {
    t2 = t1+2 ;
    for (t3=t1+4;t3<=t1+2*n+2;t3++) {
      i = t1+1 ;
      if ((t1+t3)%2 == 0) {
        k = (-t1+t3-2)/2 ;
        S1(t1+1,1,(-t1+t3-2)/2) ;
      }
    }
    for (t2=t1+3;t2<=-t1+2*n;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
      for (t3=t2+2;t3<=t2+2*n;t3++) {
        if ((t1+t2)%2 == 0) {
          i = (t1+t2)/2 ;
          j = (-t1+t2)/2 ;
          if ((t1+t3)%2 == 0) {
            k = (-t2+t3)/2 ;
            S1((t1+t2)/2,(-t1+t2)/2,(-t2+t3)/2) ;
          }
        }
      }
    }
    for (t2=-t1+2*n+1;t2<=-t1+2*n+3;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t1+t2+1)%2 == 0) {
          i = (t1+t2-3)/2 ;
          j = (-t1+t2-1)/2 ;
          S2((t1+t2-3)/2,(-t1+t2-1)/2,t3) ;
        }
      }
    }
  }
  /* Slice t1 = n-1 for n >= 3: S1(n, 1, k), then S2 for t2 in [n+2, n+4]. */
  if (n >= 3) {
    t1 = n-1 ;
    t2 = n+1 ;
    for (t3=n+3;t3<=3*n+1;t3++) {
      if ((t3+n+1)%2 == 0) {
        k = (t3-n-1)/2 ;
        S1(n,1,(t3-n-1)/2) ;
      }
    }
    for (t2=n+2;t2<=n+4;t2++) {
      for (t3=1;t3<=n;t3++) {
        if ((t2+n)%2 == 0) {
          i = (t2+n-4)/2 ;
          j = (t2-n)/2 ;
          S2((t2+n-4)/2,(t2-n)/2,t3) ;
        }
      }
    }
  }
  /* Closing row: S2(n, 1, t3) for t3 in [1, n]. */
  if (n >= 1) {
    t2 = n+3 ;
    for (t3=1;t3<=n;t3++) {
      S2(n,1,t3) ;
    }
  }
}
Exemplo n.º 17
0
/*
 * Timing driver for the annotated loop nest over the array A.
 *
 * Calls init_arrays(), then runs the kernel REPS times, timing each run with
 * rtclock() and accumulating into annot_t_total.  Afterwards it prints the
 * mean time per run, or, when TEST is defined, dumps every element of A.
 * Returns A[0][0] converted to int.
 *
 * A, N, REPS, init_arrays and rtclock are not declared in this function; the
 * annotation below names 'decl_code.h' and 'init_code.c' as their presumable
 * source -- TODO confirm.
 *
 * The slash-star-at ... at-star-slash comments are Orio annotations and act
 * as tool input: keep them byte-identical when editing by hand.
 */
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  for (annot_i=0; annot_i<REPS; annot_i++)
    {
      annot_t_start = rtclock();

      /* NOTE(review): constraint c1 in the annotation below repeats
         T1_1*T2_1<=1024 three times; possibly T1_2*T2_2 and T1_3*T2_3 were
         intended for the second and third terms -- confirm with the author. */

/*@ begin PerfTuning (         
  def build 
  { 
    arg build_command = 'icc -O3 -openmp -I/usr/local/icc/include -lm'; 
  } 
    
  def performance_counter          
  { 
    arg repetitions = 1; 
  }

  def performance_params 
  {
#    param T1_1[] = [1,16,32,64,128];
#    param T1_2[] = [1,16,32,64,128];
#    param T1_3[] = [1,16,32,64,128];
#    param T2_1[] = [1,4,8,16,32];
#    param T2_2[] = [1,4,8,16,32];
#    param T2_3[] = [1,4,8,16,32];

    param T1_1[] = [64];
    param T1_2[] = [256];
    param T1_3[] = [64];
    param T2_1[] = [1];
    param T2_2[] = [1];
    param T2_3[] = [1];

    constraint c1 = (T1_1*T2_1<=1024 and T1_1*T2_1<=1024 and T1_1*T2_1<=1024);
    constraint c2 = ((T1_1 == T1_3) and (T2_1 == T2_3));

    param U1[] = [1];
    param U2[] = [1];
    param U3[] = [7];

    constraint c3 = (U1*U2*U3<=512);

    param PERM[] = [
      #[0,1,2],
      #[0,2,1],
      #[1,0,2],
      #[1,2,0],
      [2,0,1],
      #[2,1,0],
      ];

    param PAR[] = [True];
    param SCREP[] = [False];
    param IVEC[] = [True];
  }

  def search 
  { 
    arg algorithm = 'Exhaustive'; 
#    arg algorithm = 'Simplex'; 
#    arg time_limit = 5;
#    arg total_runs = 1;
  } 
   
  def input_params 
  {
    param N[] = [1024];
  }

  def input_vars
  {
    arg decl_file = 'decl_code.h';
    arg init_file = 'init_code.c';
  }
) @*/

/**-- (Generated by Orio) 
Best performance cost: 
  0.201184 
Tuned for specific problem sizes: 
  N = 1024 
Best performance parameters: 
  IVEC = True 
  PAR = True 
  PERM = [2, 0, 1] 
  SCREP = False 
  T1_1 = 64 
  T1_2 = 256 
  T1_3 = 64 
  T2_1 = 1 
  T2_2 = 1 
  T2_3 = 1 
  U1 = 1 
  U2 = 1 
  U3 = 7 
--**/

 

/* Of the declarations in this group, only c9t is referenced by the code
   visible in this function. */
register int i,j,k;
register int c1t, c2t, c3t, c4t, c5t, c6t, c7t, c8t, c9t, c10t, c11t, c12t;
register int newlb_c1, newlb_c2, newlb_c3, newlb_c4, newlb_c5, newlb_c6,
  newlb_c7, newlb_c8, newlb_c9, newlb_c10, newlb_c11, newlb_c12;
register int newub_c1, newub_c2, newub_c3, newub_c4, newub_c5, newub_c6,
  newub_c7, newub_c8, newub_c9, newub_c10, newub_c11, newub_c12;


/*@ begin PolySyn(    
  parallel = PAR;
  tiles = [T1_1,T1_2,T1_3,T2_1,T2_2,T2_3];
  permut = PERM;
  unroll_factors = [U1,U2,U3];
  scalar_replace = SCREP;
  vectorize = IVEC;
    
  profiling_code = 'lu_profiling.c';
  compile_cmd = 'gcc';
  compile_opts = '-lm';
  ) @*/

/* NOTE(review): these #include directives sit inside the body of main.  The
   C standard expects standard headers to be included outside any function
   definition -- confirm whether the generator can hoist them to file scope. */
#include <math.h>
#include <assert.h>

/* ceild/floord go through double-precision ceil/floor, so every loop bound
   built from them is a double that is compared with (or assigned to) an int. */
#define ceild(n,d)  ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))

		
	int c1, c2, c3, c4, c5, c6, c7, c8, c9;

	register int lb, ub, lb1, ub1, lb2, ub2;
/* Generated from PLuTo-produced CLooG file by CLooG v0.14.1 64 bits in 0.05s. */
/* c1 runs sequentially; for each c1 the c2 range [lb1, ub1] is split across
   OpenMP threads with c2..c9 private to each thread. */
for (c1=-1;c1<=floord(5*N-9,256);c1++) {
	lb1=max(max(ceild(32*c1-127,160),ceild(64*c1-N+2,64)),0);
	ub1=min(floord(64*c1+63,64),floord(N-1,256));
#pragma omp parallel for shared(c1,lb1,ub1) private(c2,c3,c4,c5,c6,c7,c8,c9)
	for (c2=lb1; c2<=ub1; c2++) {
    for (c3=max(ceild(32*c1-32*c2-1953,2016),ceild(32*c1-32*c2-31,32));c3<=floord(N-1,64);c3++) {
      /* Case c1 == c2+c3: divide A[c7][c8] by A[c7][c7], then update rows c9
         below c7 in the same c8 column. */
      if (c1 == c2+c3) {
        for (c7=max(64*c3,0);c7<=min(min(N-2,64*c3+62),256*c2+254);c7++) {
          for (c8=max(c7+1,256*c2);c8<=min(N-1,256*c2+255);c8++) {
            A[c7][c8]=A[c7][c8]/A[c7][c7] ;
            for (c9=c7+1;c9<=min(N-1,64*c3+63);c9++) {
              A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ;
            }
          }
        }
      }
/*@ begin Loop(
transform Composite(
permut = [['c9', 'c7', 'c8']],
  regtile = (['c7', 'c8', 'c9'],[1, 1, 7]),
  scalarreplace = (False, 'double'),
  vector = (True, ['ivdep','vector always']))

      for (c7=max(0,64*c1-64*c2);c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1);c7++) {
        for (c8=max(c7+1,256*c2);c8<=min(256*c2+255,N-1);c8++) {
          for (c9=64*c3;c9<=min(N-1,64*c3+63);c9++) {
            A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8] ;
          }
        }
      }

) @*/{
  /* Strip loop: c9 is processed seven rows at a time (c9t .. c9t+6). */
  for (c9t=64*c3; c9t<=min(N-1,64*c3+63)-6; c9t=c9t+7) {
    for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) {
      register int cbv_1, cbv_2;
      cbv_1=max(c7+1,256*c2);
      cbv_2=min(256*c2+255,N-1);
#pragma ivdep
#pragma vector always
      for (c8=cbv_1; c8<=cbv_2; c8++ ) {
        A[c9t][c8]=A[c9t][c8]-A[c9t][c7]*A[c7][c8];
        A[(c9t+1)][c8]=A[(c9t+1)][c8]-A[(c9t+1)][c7]*A[c7][c8];
        A[(c9t+2)][c8]=A[(c9t+2)][c8]-A[(c9t+2)][c7]*A[c7][c8];
        A[(c9t+3)][c8]=A[(c9t+3)][c8]-A[(c9t+3)][c7]*A[c7][c8];
        A[(c9t+4)][c8]=A[(c9t+4)][c8]-A[(c9t+4)][c7]*A[c7][c8];
        A[(c9t+5)][c8]=A[(c9t+5)][c8]-A[(c9t+5)][c7]*A[c7][c8];
        A[(c9t+6)][c8]=A[(c9t+6)][c8]-A[(c9t+6)][c7]*A[c7][c8];
      }
    }
  }
  /* Remainder loop: starts from the c9t value left by the strip loop above,
     so the two loops must stay adjacent and in this order. */
  for (c9=c9t; c9<=min(N-1,64*c3+63); c9=c9+1) {
    for (c7=max(0,64*c1-64*c2); c7<=min(min(256*c2+254,64*c1-64*c2+63),64*c3-1); c7++ ) {
      register int cbv_3, cbv_4;
      cbv_3=max(c7+1,256*c2);
      cbv_4=min(256*c2+255,N-1);
#pragma ivdep
#pragma vector always
      for (c8=cbv_3; c8<=cbv_4; c8++ ) {
        A[c9][c8]=A[c9][c8]-A[c9][c7]*A[c7][c8];
      }
    }
  }
}
/*@ end @*/

      /* Last row of the tile (row 64*c1-64*c2+63): division step only. */
      if ((-c1 == -c2-c3) && (c1 <= min(floord(320*c2+191,64),floord(64*c2+N-65,64)))) {
        for (c8=max(256*c2,64*c1-64*c2+64);c8<=min(256*c2+255,N-1);c8++) {
          A[64*c1-64*c2+63][c8]=A[64*c1-64*c2+63][c8]/A[64*c1-64*c2+63][64*c1-64*c2+63] ;
        }
      }
    }
  }
}
/* End of CLooG code */

/*@ end @*/
/*@ end @*/




      annot_t_end = rtclock();
      annot_t_total += annot_t_end - annot_t_start;
    }

  /* Mean time per repetition. */
  annot_t_total = annot_t_total / REPS;

#ifndef TEST
  printf("%f\n", annot_t_total);
#else
  {
    /* Dump A row by row, breaking the line before every 100th column. */
    int i, j;
    for (i=0; i<N; i++) {
      for (j=0; j<N; j++) {
        if (j%100==0)
          printf("\n");
        printf("%f ",A[i][j]);
      }
      printf("\n");
    }
  }
#endif

  /* presumably returned so the result of the kernel is observably used -- TODO confirm */
  return ((int) A[0][0]);

}