Beispiel #1
0
void rollback( const unsigned g, PrivGlobs& globs ) {
    unsigned numX = globs.myXsize,
             numY = globs.myYsize;

    unsigned numZ = max(numX,numY);
    unsigned i, j;

    REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]);

    REAL *u = (REAL*) malloc(numY*numX*sizeof(REAL));           // [numY][numX]
    REAL *uT = (REAL*) malloc(numX*numY*sizeof(REAL));          // [numX][numY]
    REAL *v = (REAL*) malloc(numX*numY*sizeof(REAL));           // [numX][numY]
    REAL *y = (REAL*) malloc(numX*numY*sizeof(REAL));           // [numX][numZ]
    REAL *yy = (REAL*) malloc(numY*sizeof(REAL));           // [max(numX,numY)]


    for(i=0;i<numX;i++) { //par
        for(j=0;j<numY;j++) { //par
            //TODO: This can be combined in the tridag kernel, in shared mem.

            uT[idx2d(i,j,numY)] = dtInv*globs.myResult[idx2d(i,j,globs.myResultCols)];
            if(i > 0) {
                uT[idx2d(i,j,numY)] += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,0,globs.myDxxCols)] )
                            * globs.myResult[idx2d(i-1,j,globs.myResultCols)];
            }
            uT[idx2d(i,j,numY)]  += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,1,globs.myDxxCols)] )
                            * globs.myResult[idx2d(i,j,globs.myResultCols)];
            if(i < numX-1) {
                uT[idx2d(i,j,numY)] += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,2,globs.myDxxCols)] )
                            * globs.myResult[idx2d(i+1,j,globs.myResultCols)];
            }
        }
    }

  
    for(i=0;i<numX;i++) { //par
        for(j=0;j<numY;j++) { //par
            //TODO: This can be combined in the tridag kernel too, as parameters.
            v[idx2d(i,j,numY)] = 0.0;
            if(j > 0) {
              v[idx2d(i,j,numY)] +=  ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,0,globs.myDyyCols)])
                         *  globs.myResult[idx2d(i,j-1,globs.myResultCols)];
            }
            v[idx2d(i,j,numY)]  +=  ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,1,globs.myDyyCols)])
                         *  globs.myResult[idx2d(i,j,globs.myResultCols)];
            if(j < numY-1) {
              v[idx2d(i,j,numY)] += ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,2,globs.myDyyCols)])
                         *  globs.myResult[idx2d(i,j+1,globs.myResultCols)];
            }
            uT[idx2d(i,j,numY)] += v[idx2d(i,j,numY)];
        }
    }

    transpose2d(uT, &u, numY, numX);

    REAL *a = (REAL*) malloc(numY*numX*sizeof(REAL));           // [numY][numZ]
    REAL *b = (REAL*) malloc(numY*numX*sizeof(REAL));           // [numY][numZ]
    REAL *c = (REAL*) malloc(numY*numX*sizeof(REAL));           // [numY][numZ]
    REAL *aT = (REAL*) malloc(numX*numY*sizeof(REAL));           // [numZ][numY]
    REAL *bT = (REAL*) malloc(numX*numY*sizeof(REAL));           // [numZ][numY]
    REAL *cT = (REAL*) malloc(numX*numY*sizeof(REAL));           // [numZ][numY]

    for(i=0;i<numX;i++) {  // par // here a, b,c should have size [numX]
        for(j=0;j<numY;j++) { // par
            aT[idx2d(i,j,numY)] =    - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,0,globs.myDxxCols)]);
            bT[idx2d(i,j,numY)] = dtInv
                            - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,1,globs.myDxxCols)]);
            cT[idx2d(i,j,numY)] =    - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,2,globs.myDxxCols)]);
        }
    }
    transpose2d(aT, &a, numY, numX);
    transpose2d(bT, &b, numY, numX);
    transpose2d(cT, &c, numY, numX);

    for(j=0;j<numY;j++) { // par
        // here yy should have size [numX]
        tridagPar(&a[idx2d(j,0,numX)], &b[idx2d(j,0,numX)], &c[idx2d(j,0,numX)]
                 ,&u[idx2d(j,0,numX)],numX,&u[idx2d(j,0,numX)],&yy[0]);
    }


    for(i=0;i<numX;i++) { // par
        // parallelizable via loop distribution / array expansion.
        for(j=0;j<numY;j++) { // par  // here a, b, c should have size [numY]
            aT[idx2d(i,j,numY)] =       - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,0,globs.myDyyCols)]);
            bT[idx2d(i,j,numY)] = dtInv - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,1,globs.myDyyCols)]);
            cT[idx2d(i,j,numY)] =       - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,2,globs.myDyyCols)]);
        }
    }
    transpose2d(aT, &a, numY, numX);
    transpose2d(bT, &b, numY, numX);
    transpose2d(cT, &c, numY, numX);

    transpose2d(u, &uT, numX, numY); //Must retranspose to uT because prev tridag
                                     // modified u.
    // Coalesced memory acces.
    for(i=0;i<numX;i++) { // par
        for(j=0;j<numY;j++) { // par
            y[idx2d(i,j,numY)] = dtInv * uT[idx2d(i,j,numY)]
                               - 0.5*v[idx2d(i,j,numY)];
        }
    }


    for(i=0;i<numX;i++) { // par
        // here yy should have size [numX]

        tridagPar(&aT[idx2d(i,0,numY)], &bT[idx2d(i,0,numY)],
                  &cT[idx2d(i,0,numY)], &y[idx2d(i,0,numY)], numY,
                  &globs.myResult[idx2d(i,0,globs.myResultCols)],&yy[0]);
                  //&globs.myResult[idx2d(i,0, globs.myResultCols)],&yy[0]);
    }

    free(u);
    free(uT);
    free(v);
    free(y);
    free(yy);
    free(a);
    free(b);
    free(c);
    free(aT);
    free(bT);
    free(cT);
}
Beispiel #2
0
void
rollback( const unsigned g, PrivGlobs& globs, int outer, const int& numX, 
          const int& numY) 
{
  unsigned numZ = max(numX,numY);
  unsigned numM = numX * numY;

  REAL* u = (REAL*) malloc(sizeof(REAL) * outer * numY * numX);   // [outer][numY][numX]
  REAL* v = (REAL*) malloc(sizeof(REAL) * outer * numX * numY);   // [outer][numX][numY]
  REAL* ax = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX]
  REAL* bx = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX]
  REAL* cx = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX]
  REAL* ay = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY]
  REAL* by = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY]
  REAL* cy = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY]
  REAL* y = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numZ][numZ]
  REAL* yy = (REAL*) malloc(sizeof(REAL)*outer*numZ); // [outer][numZ]

// X-loop
#pragma omp parallel for default(shared) schedule(static) if(outer>8)
  for (int o = 0; o < outer; o++) 
  {
    REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]);
    for(int j=0;j<numY;j++) 
    {
      for(int i=0;i<numX;i++) 
      {
        // implicit x
        ax[o * numX * numY + j * numX + i] = 
          -0.5*(0.5*globs.myVarX[o * numM + j * numX + i]
                   *globs.myDxx[i * 4 + 0]);
        bx[o * numX * numY + j * numX + i] = 
          dtInv - 0.5*(0.5*globs.myVarX[o * numM + j * numX + i]
                          *globs.myDxx[i * 4 + 1]);
        cx[o * numX * numY + j * numX + i] =
          -0.5*(0.5*globs.myVarX[o * numM + j * numX + i]
                   *globs.myDxx[i * 4 + 2]);
        //	explicit x
        u[o * numX * numY + j * numX + i] = 
          dtInv*globs.myResult[o * numM + i * numY + j];
        if(i > 0) 
        {
          u[o * numX * numY + j * numX + i] += 
            0.5*(0.5*globs.myVarX[o * numM + + j * numX + i]
                    *globs.myDxx[i * 4 + 0])
                    *globs.myResult[o * numM + (i-1) * numY + j];
        }
        u[o * numX * numY + j * numX + i]  +=  
          0.5*(0.5*globs.myVarX[o * numM + + j * numX + i]*globs.myDxx[i * 4 + 1])
                  *globs.myResult[o * numM + i * numY + j];
        if(i < numX-1) 
        {
          u[o * numX * numY + j * numX + i] += 
            0.5*(0.5*globs.myVarX[o * numM + + j * numX + i]
                    *globs.myDxx[i * 4 + 2])
                    *globs.myResult[o * numM + (i+1) * numY + j];
        }
      }
    }
  }
#pragma omp parallel for default(shared) schedule(static) if(outer>8)
  for (int o = 0; o < outer; o++) 
  {
    REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]);
    unsigned i, j;
    // Y-Loop
    for(i=0;i<numX;i++) 
    {
      for(j=0;j<numY;j++)
        {
          // Explicit y
          v[o * numX * numY + i * numY + j] = 0.0;
          if(j > 0) 
          {
            v[o * numX * numY + i * numY + j] +=  
              (0.5*globs.myVarY[o * numM + i * numY + j]*globs.myDyy[j * 4 + 0])
                  *globs.myResult[o * numM + i * numY + j-1];
          }

          v[o * numX * numY + i * numY + j] += 
            (0.5*globs.myVarY[o * numM + i * numY + j]
                *globs.myDyy[j * 4 + 1])
                *globs.myResult[o * numM + i  * numY + j];

          if(j < numY-1) 
          {
            v[o * numX * numY + i * numY + j] +=  
              (0.5*globs.myVarY[o * numM + i * numY + j]
                  *globs.myDyy[j * 4 + 2])
                  *globs.myResult[o * numM + i * numY + j+1];
          }
          u[o * numX * numY + j * numX + i] += v[o * numX * numY + i * numY + j];
          // Implicit y
          ay[o * numX * numY + i * numY + j] =
            -0.5*(0.5*globs.myVarY[o * numM + i * numY + j]
                     *globs.myDyy[j * 4 + 0]);
          by[o * numX * numY + i * numY + j] = 
            dtInv - 0.5*(0.5*globs.myVarY[o * numM + i * numY + j]
                            *globs.myDyy[j * 4 + 1]);
          cy[o * numX * numY + i * numY + j] =
            -0.5*(0.5*globs.myVarY[o * numM + i * numY + j]
                     *globs.myDyy[j * 4 + 2]);
        }
    }
  }
#pragma omp parallel for default(shared) schedule(static) if(outer>8)
  for(int o = 0; o < outer; o++) 
  {
    for(int j=0;j<numY;j++) 
      {
      // here yy should have size [numX]
      tridagPar(&ax[o * numX * numY + j * numX],&bx[o * numX * numY + j * numX],
                &cx[o * numX * numY + j * numX], &u[o * numX * numY + j * numX], 
                numX, &u[o * numX * numY + numX * j], &yy[o*numZ]);
    }
  }

  //	implicit y
#pragma omp parallel for default(shared) schedule(static) if(outer>8)
  for(int o = 0; o < outer; o++)
  {
    REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]);
    for(int i=0;i<numX;i++)
    {
      for(int j=0;j<numY;j++)
      {  // here a, b, c should have size [numY]
        y[o * numX * numY + i * numY + j] =
          dtInv*u[o * numX * numY + j * numX + i]
           -0.5*v[o * numX * numY + i * numY + j];
      }
    }
  }
#pragma omp parallel for default(shared) schedule(static) if(outer>8)
  for(int o = 0; o < outer; o++)
  {
    for(int i=0;i<numX;i++)
    {
      // here yy should have size [numY]
      tridagPar(&ay[o * numX * numY + i * numY],&by[o * numX * numY + i * numY],
                &cy[o * numX * numY + i * numY],&y[o * numX * numY + i * numY],
                numY,&globs.myResult[o * numM + i * numY],&yy[o*numZ]);
    }
  }
  free(u);
  free(ax);
  free(ay);
  free(bx);
  free(by);
  free(cx);
  free(cy);
  free(yy);
  free(y);
}