void rollback( const unsigned g, PrivGlobs& globs ) { unsigned numX = globs.myXsize, numY = globs.myYsize; unsigned numZ = max(numX,numY); unsigned i, j; REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]); REAL *u = (REAL*) malloc(numY*numX*sizeof(REAL)); // [numY][numX] REAL *uT = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numX][numY] REAL *v = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numX][numY] REAL *y = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numX][numZ] REAL *yy = (REAL*) malloc(numY*sizeof(REAL)); // [max(numX,numY)] for(i=0;i<numX;i++) { //par for(j=0;j<numY;j++) { //par //TODO: This can be combined in the tridag kernel, in shared mem. uT[idx2d(i,j,numY)] = dtInv*globs.myResult[idx2d(i,j,globs.myResultCols)]; if(i > 0) { uT[idx2d(i,j,numY)] += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,0,globs.myDxxCols)] ) * globs.myResult[idx2d(i-1,j,globs.myResultCols)]; } uT[idx2d(i,j,numY)] += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,1,globs.myDxxCols)] ) * globs.myResult[idx2d(i,j,globs.myResultCols)]; if(i < numX-1) { uT[idx2d(i,j,numY)] += 0.5*( 0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,2,globs.myDxxCols)] ) * globs.myResult[idx2d(i+1,j,globs.myResultCols)]; } } } for(i=0;i<numX;i++) { //par for(j=0;j<numY;j++) { //par //TODO: This can be combined in the tridag kernel too, as parameters. v[idx2d(i,j,numY)] = 0.0; if(j > 0) { v[idx2d(i,j,numY)] += ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,0,globs.myDyyCols)]) * globs.myResult[idx2d(i,j-1,globs.myResultCols)]; } v[idx2d(i,j,numY)] += ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,1,globs.myDyyCols)]) * globs.myResult[idx2d(i,j,globs.myResultCols)]; if(j < numY-1) { v[idx2d(i,j,numY)] += ( 0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,2,globs.myDyyCols)]) * globs.myResult[idx2d(i,j+1,globs.myResultCols)]; } uT[idx2d(i,j,numY)] += v[idx2d(i,j,numY)]; } } transpose2d(uT, &u, numY, numX); REAL *a = (REAL*) malloc(numY*numX*sizeof(REAL)); // [numY][numZ] REAL *b = (REAL*) malloc(numY*numX*sizeof(REAL)); // [numY][numZ] REAL *c = (REAL*) malloc(numY*numX*sizeof(REAL)); // [numY][numZ] REAL *aT = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numZ][numY] REAL *bT = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numZ][numY] REAL *cT = (REAL*) malloc(numX*numY*sizeof(REAL)); // [numZ][numY] for(i=0;i<numX;i++) { // par // here a, b,c should have size [numX] for(j=0;j<numY;j++) { // par aT[idx2d(i,j,numY)] = - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,0,globs.myDxxCols)]); bT[idx2d(i,j,numY)] = dtInv - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,1,globs.myDxxCols)]); cT[idx2d(i,j,numY)] = - 0.5*(0.5*globs.myVarX[idx2d(i,j,globs.myVarXCols)]*globs.myDxx[idx2d(i,2,globs.myDxxCols)]); } } transpose2d(aT, &a, numY, numX); transpose2d(bT, &b, numY, numX); transpose2d(cT, &c, numY, numX); for(j=0;j<numY;j++) { // par // here yy should have size [numX] tridagPar(&a[idx2d(j,0,numX)], &b[idx2d(j,0,numX)], &c[idx2d(j,0,numX)] ,&u[idx2d(j,0,numX)],numX,&u[idx2d(j,0,numX)],&yy[0]); } for(i=0;i<numX;i++) { // par // parallelizable via loop distribution / array expansion. for(j=0;j<numY;j++) { // par // here a, b, c should have size [numY] aT[idx2d(i,j,numY)] = - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,0,globs.myDyyCols)]); bT[idx2d(i,j,numY)] = dtInv - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,1,globs.myDyyCols)]); cT[idx2d(i,j,numY)] = - 0.5*(0.5*globs.myVarY[idx2d(i,j,globs.myVarYCols)]*globs.myDyy[idx2d(j,2,globs.myDyyCols)]); } } transpose2d(aT, &a, numY, numX); transpose2d(bT, &b, numY, numX); transpose2d(cT, &c, numY, numX); transpose2d(u, &uT, numX, numY); //Must retranspose to uT because prev tridag // modified u. // Coalesced memory acces. for(i=0;i<numX;i++) { // par for(j=0;j<numY;j++) { // par y[idx2d(i,j,numY)] = dtInv * uT[idx2d(i,j,numY)] - 0.5*v[idx2d(i,j,numY)]; } } for(i=0;i<numX;i++) { // par // here yy should have size [numX] tridagPar(&aT[idx2d(i,0,numY)], &bT[idx2d(i,0,numY)], &cT[idx2d(i,0,numY)], &y[idx2d(i,0,numY)], numY, &globs.myResult[idx2d(i,0,globs.myResultCols)],&yy[0]); //&globs.myResult[idx2d(i,0, globs.myResultCols)],&yy[0]); } free(u); free(uT); free(v); free(y); free(yy); free(a); free(b); free(c); free(aT); free(bT); free(cT); }
void rollback( const unsigned g, PrivGlobs& globs, int outer, const int& numX, const int& numY) { unsigned numZ = max(numX,numY); unsigned numM = numX * numY; REAL* u = (REAL*) malloc(sizeof(REAL) * outer * numY * numX); // [outer][numY][numX] REAL* v = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY] REAL* ax = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX] REAL* bx = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX] REAL* cx = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numY][numX] REAL* ay = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY] REAL* by = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY] REAL* cy = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numX][numY] REAL* y = (REAL*) malloc(sizeof(REAL) * outer * numX * numY); // [outer][numZ][numZ] REAL* yy = (REAL*) malloc(sizeof(REAL)*outer*numZ); // [outer][numZ] // X-loop #pragma omp parallel for default(shared) schedule(static) if(outer>8) for (int o = 0; o < outer; o++) { REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]); for(int j=0;j<numY;j++) { for(int i=0;i<numX;i++) { // implicit x ax[o * numX * numY + j * numX + i] = -0.5*(0.5*globs.myVarX[o * numM + j * numX + i] *globs.myDxx[i * 4 + 0]); bx[o * numX * numY + j * numX + i] = dtInv - 0.5*(0.5*globs.myVarX[o * numM + j * numX + i] *globs.myDxx[i * 4 + 1]); cx[o * numX * numY + j * numX + i] = -0.5*(0.5*globs.myVarX[o * numM + j * numX + i] *globs.myDxx[i * 4 + 2]); // explicit x u[o * numX * numY + j * numX + i] = dtInv*globs.myResult[o * numM + i * numY + j]; if(i > 0) { u[o * numX * numY + j * numX + i] += 0.5*(0.5*globs.myVarX[o * numM + + j * numX + i] *globs.myDxx[i * 4 + 0]) *globs.myResult[o * numM + (i-1) * numY + j]; } u[o * numX * numY + j * numX + i] += 0.5*(0.5*globs.myVarX[o * numM + + j * numX + i]*globs.myDxx[i * 4 + 1]) *globs.myResult[o * numM + i * numY + j]; if(i < numX-1) { u[o * numX * numY + j * numX + i] += 0.5*(0.5*globs.myVarX[o * numM + + j * numX + i] *globs.myDxx[i * 4 + 2]) *globs.myResult[o * numM + (i+1) * numY + j]; } } } } #pragma omp parallel for default(shared) schedule(static) if(outer>8) for (int o = 0; o < outer; o++) { REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]); unsigned i, j; // Y-Loop for(i=0;i<numX;i++) { for(j=0;j<numY;j++) { // Explicit y v[o * numX * numY + i * numY + j] = 0.0; if(j > 0) { v[o * numX * numY + i * numY + j] += (0.5*globs.myVarY[o * numM + i * numY + j]*globs.myDyy[j * 4 + 0]) *globs.myResult[o * numM + i * numY + j-1]; } v[o * numX * numY + i * numY + j] += (0.5*globs.myVarY[o * numM + i * numY + j] *globs.myDyy[j * 4 + 1]) *globs.myResult[o * numM + i * numY + j]; if(j < numY-1) { v[o * numX * numY + i * numY + j] += (0.5*globs.myVarY[o * numM + i * numY + j] *globs.myDyy[j * 4 + 2]) *globs.myResult[o * numM + i * numY + j+1]; } u[o * numX * numY + j * numX + i] += v[o * numX * numY + i * numY + j]; // Implicit y ay[o * numX * numY + i * numY + j] = -0.5*(0.5*globs.myVarY[o * numM + i * numY + j] *globs.myDyy[j * 4 + 0]); by[o * numX * numY + i * numY + j] = dtInv - 0.5*(0.5*globs.myVarY[o * numM + i * numY + j] *globs.myDyy[j * 4 + 1]); cy[o * numX * numY + i * numY + j] = -0.5*(0.5*globs.myVarY[o * numM + i * numY + j] *globs.myDyy[j * 4 + 2]); } } } #pragma omp parallel for default(shared) schedule(static) if(outer>8) for(int o = 0; o < outer; o++) { for(int j=0;j<numY;j++) { // here yy should have size [numX] tridagPar(&ax[o * numX * numY + j * numX],&bx[o * numX * numY + j * numX], &cx[o * numX * numY + j * numX], &u[o * numX * numY + j * numX], numX, &u[o * numX * numY + numX * j], &yy[o*numZ]); } } // implicit y #pragma omp parallel for default(shared) schedule(static) if(outer>8) for(int o = 0; o < outer; o++) { REAL dtInv = 1.0/(globs.myTimeline[g+1]-globs.myTimeline[g]); for(int i=0;i<numX;i++) { for(int j=0;j<numY;j++) { // here a, b, c should have size [numY] y[o * numX * numY + i * numY + j] = dtInv*u[o * numX * numY + j * numX + i] -0.5*v[o * numX * numY + i * numY + j]; } } } #pragma omp parallel for default(shared) schedule(static) if(outer>8) for(int o = 0; o < outer; o++) { for(int i=0;i<numX;i++) { // here yy should have size [numY] tridagPar(&ay[o * numX * numY + i * numY],&by[o * numX * numY + i * numY], &cy[o * numX * numY + i * numY],&y[o * numX * numY + i * numY], numY,&globs.myResult[o * numM + i * numY],&yy[o*numZ]); } } free(u); free(ax); free(ay); free(bx); free(by); free(cx); free(cy); free(yy); free(y); }