/**
 * Build the single-precision host-side arrays used by the CPU code path.
 *
 * Fills the following members of `mesh`:
 *   - f_Q, f_rhsQ, f_resQ : zero-initialised, K*p_Np*Nfields floats each
 *   - f_LIFT              : row-major float copy of mesh->LIFT (p_Np x p_Nfp*p_Nfaces)
 *   - f_Dr, f_Ds, f_Dt    : row-major float copies of mesh->Dr/Ds/Dt (p_Np x p_Np)
 *   - vgeo                : 12 floats per element; slots 0-2, 4-6, 8-10 hold the
 *                           r/s/t derivative factors, slots 3, 7, 11 stay zero
 *   - surfinfo            : 7 floats per face node
 *                           (idM, idP, Fscale, Bscale, nx, ny, nz)
 *
 * @param mesh    mesh whose float arrays are allocated and filled
 * @param Nfields number of fields stored per node
 * @return the minimum of J/sJ over all elements and faces (capped at 1e6)
 *
 * NOTE(review): allocation results are not checked, and the scratch vectors
 * obtained from BuildVector are not released here -- the matching deallocator
 * is not visible in this part of the file.
 */
double InitCPU3d(Mesh *mesh, int Nfields){

  printf("Np = %d, BSIZE = %d\n", p_Np, BSIZE);

  /* Q, rhsQ, resQ (sizes are widened to size_t before multiplying) */
  const size_t fieldCount = static_cast<size_t>(mesh->K)*p_Np*Nfields;
  mesh->f_Q    = static_cast<float*>(calloc(fieldCount, sizeof(float)));
  mesh->f_rhsQ = static_cast<float*>(calloc(fieldCount, sizeof(float)));
  mesh->f_resQ = static_cast<float*>(calloc(fieldCount, sizeof(float)));

  /* float LIFT */
  const size_t liftCount = static_cast<size_t>(p_Np)*p_Nfp*p_Nfaces;
  mesh->f_LIFT = static_cast<float*>(malloc(liftCount*sizeof(float)));

  size_t sk = 0;
  for(int n=0;n<p_Np;++n){
    for(int m=0;m<p_Nfp*p_Nfaces;++m){
      mesh->f_LIFT[sk++] = mesh->LIFT[n][m];
    }
  }

  /* float Dr, Ds & Dt */
  const size_t diffCount = static_cast<size_t>(p_Np)*p_Np;
  mesh->f_Dr = static_cast<float*>(malloc(diffCount*sizeof(float)));
  mesh->f_Ds = static_cast<float*>(malloc(diffCount*sizeof(float)));
  mesh->f_Dt = static_cast<float*>(malloc(diffCount*sizeof(float)));

  sk = 0;
  for(int n=0;n<p_Np;++n){
    for(int m=0;m<p_Np;++m){
      mesh->f_Dr[sk] = mesh->Dr[n][m];
      mesh->f_Ds[sk] = mesh->Ds[n][m];
      mesh->f_Dt[sk] = mesh->Dt[n][m];
      ++sk;
    }
  }

  /* vgeo: every fourth slot is padding and keeps the zero from calloc */
  double drdx, dsdx, dtdx;
  double drdy, dsdy, dtdy;
  double drdz, dsdz, dtdz, J;

  mesh->vgeo = static_cast<float*>(calloc(static_cast<size_t>(mesh->K)*12, sizeof(float)));

  for(int k=0;k<mesh->K;++k){
    GeometricFactors3d(mesh, k,
                       &drdx, &dsdx, &dtdx,
                       &drdy, &dsdy, &dtdy,
                       &drdz, &dsdz, &dtdz, &J);

    float *vg = mesh->vgeo + static_cast<size_t>(k)*12;
    vg[0] = drdx;  vg[1] = drdy;  vg[2]  = drdz;
    vg[4] = dsdx;  vg[5] = dsdy;  vg[6]  = dsdz;
    vg[8] = dtdx;  vg[9] = dtdy;  vg[10] = dtdz;
  }

  /* surfinfo (idM, idP, Fscale, Bscale, nx, ny, nz): 7 floats per face node */
  const size_t surfCount = static_cast<size_t>(mesh->K)*p_Nfp*p_Nfaces*7;
  mesh->surfinfo = static_cast<float*>(malloc(surfCount*sizeof(float)));

  /* per-element scratch storage for face normals and surface Jacobians */
  double *nxk = BuildVector(mesh->Nfaces);
  double *nyk = BuildVector(mesh->Nfaces);
  double *nzk = BuildVector(mesh->Nfaces);
  double *sJk = BuildVector(mesh->Nfaces);

  double dt = 1e6;

  sk = 0;
  for(int k=0;k<mesh->K;++k){

    GeometricFactors3d(mesh, k,
                       &drdx, &dsdx, &dtdx,
                       &drdy, &dsdy, &dtdy,
                       &drdz, &dsdz, &dtdz, &J);

    Normals3d(mesh, k, nxk, nyk, nzk, sJk);

    for(int f=0;f<mesh->Nfaces;++f){

      dt = min(dt, J/sJk[f]);

      for(int m=0;m<p_Nfp;++m){

        const int id = m + f*p_Nfp + p_Nfp*p_Nfaces*k;

        /* split the global node ids into (node, element) and rescale so
           they address the first field of that node in the Q layout */
        int idM = mesh->vmapM[id];
        int idP = mesh->vmapP[id];

        const int nM = idM%p_Np;
        const int nP = idP%p_Np;
        const int kM = (idM-nM)/p_Np;
        const int kP = (idP-nP)/p_Np;

        idM = Nfields*(nM + p_Np*kM);
        idP = Nfields*(nP + p_Np*kP);

        /* stub resolve some other way */
        if(mesh->vmapP[id]<0){
          idP = mesh->vmapP[id]; /* -ve numbers are passed through untouched */
        }

        mesh->surfinfo[sk++] = idM;
        mesh->surfinfo[sk++] = idP;
        mesh->surfinfo[sk++] = sJk[f]/(2.*J);
        mesh->surfinfo[sk++] = (idM==idP)?-1.:1.; /* -1 when a node maps to itself */
        mesh->surfinfo[sk++] = nxk[f];
        mesh->surfinfo[sk++] = nyk[f];
        mesh->surfinfo[sk++] = nzk[f];
      }
    }
  }

  /* The function is declared double; hand back the computed bound instead of
     flowing off the end (undefined behaviour). Mirrors InitOCCA3d. */
  return dt;
}
double InitOCCA3d(Mesh *mesh, int Nfields){ device.setup("mode = OpenCL, platformID = 0, deviceID = 2"); /* Q */ int sz = mesh->K*(BSIZE)*p_Nfields*sizeof(float); float *f_Q = (float*) calloc(mesh->K*BSIZE*p_Nfields, sizeof(float)); c_Q = device.malloc(sz, f_Q); c_rhsQ = device.malloc(sz, f_Q); c_resQ = device.malloc(sz, f_Q); printf("sz1= %d\n", sz); sz = mesh->parNtotalout*sizeof(float); c_tmp = device.malloc(sz+1, f_Q); // should not use f_Q c_partQ = device.malloc(sz+1, f_Q); printf("sz2= %d\n", sz); /* LIFT */ sz = p_Np*(p_Nfp)*p_Nfaces*sizeof(float); float *f_LIFT = (float*) malloc(sz); int skL = 0; for(int m=0;m<p_Nfp;++m){ for(int n=0;n<p_Np;++n){ for(int f=0;f<p_Nfaces;++f){ f_LIFT[skL++] = mesh->LIFT[0][p_Nfp*p_Nfaces*n+(f+p_Nfaces*m)]; } } } c_LIFT = device.malloc(sz, f_LIFT); /* DrDsDt */ sz = BSIZE*BSIZE*4*sizeof(float); float* h_DrDsDt = (float*) calloc(BSIZE*BSIZE*4, sizeof(float)); int sk = 0; /* note transposed arrays to avoid "bank conflicts" */ for(int n=0;n<p_Np;++n){ for(int m=0;m<p_Np;++m){ h_DrDsDt[4*(m+n*BSIZE)+0] = mesh->Dr[0][n+m*p_Np]; h_DrDsDt[4*(m+n*BSIZE)+1] = mesh->Ds[0][n+m*p_Np]; h_DrDsDt[4*(m+n*BSIZE)+2] = mesh->Dt[0][n+m*p_Np]; } } c_DrDsDt = device.malloc(sz, h_DrDsDt); free(h_DrDsDt); /* vgeo */ double drdx, dsdx, dtdx; double drdy, dsdy, dtdy; double drdz, dsdz, dtdz, J; float *vgeo = (float*) calloc(12*mesh->K, sizeof(float)); for(int k=0;k<mesh->K;++k){ GeometricFactors3d(mesh, k, &drdx, &dsdx, &dtdx, &drdy, &dsdy, &dtdy, &drdz, &dsdz, &dtdz, &J); vgeo[k*12+0] = drdx; vgeo[k*12+1] = drdy; vgeo[k*12+2] = drdz; vgeo[k*12+4] = dsdx; vgeo[k*12+5] = dsdy; vgeo[k*12+6] = dsdz; vgeo[k*12+8] = dtdx; vgeo[k*12+9] = dtdy; vgeo[k*12+10] = dtdz; } sz = mesh->K*12*sizeof(float); c_vgeo = device.malloc(sz, vgeo); /* surfinfo (vmapM, vmapP, Fscale, Bscale, nx, ny, nz, 0) */ int sz5 = mesh->K*p_Nfp*p_Nfaces*5*sizeof(float); float* h_surfinfo = (float*) malloc(sz5); int sz2 = mesh->K*p_Nfp*p_Nfaces*2*sizeof(int); int* h_mapinfo = (int*) 
malloc(sz2); /* local-local info */ sk = 0; int skP = -1; double *nxk = BuildVector(mesh->Nfaces); double *nyk = BuildVector(mesh->Nfaces); double *nzk = BuildVector(mesh->Nfaces); double *sJk = BuildVector(mesh->Nfaces); double dt = 1e6; for(int k=0;k<mesh->K;++k){ GeometricFactors3d(mesh, k, &drdx, &dsdx, &dtdx, &drdy, &dsdy, &dtdy, &drdz, &dsdz, &dtdz, &J); Normals3d(mesh, k, nxk, nyk, nzk, sJk); for(int f=0;f<mesh->Nfaces;++f){ dt = min(dt, J/sJk[f]); for(int m=0;m<p_Nfp;++m){ int n = m + f*p_Nfp + p_Nfp*p_Nfaces*k; int idM = mesh->vmapM[n]; int idP = mesh->vmapP[n]; int nM = idM%p_Np; int nP = idP%p_Np; int kM = (idM-nM)/p_Np; int kP = (idP-nP)/p_Np; idM = nM + Nfields*BSIZE*kM; idP = nP + Nfields*BSIZE*kP; /* stub resolve some other way */ if(mesh->vmapP[n]<0){ idP = mesh->vmapP[n]; /* -ve numbers */ } sk = 2*p_Nfp*p_Nfaces*k+m+f*p_Nfp; h_mapinfo[sk + 0*p_Nfp*p_Nfaces] = idM; h_mapinfo[sk + 1*p_Nfp*p_Nfaces] = idP; sk = 5*p_Nfp*p_Nfaces*k+m+f*p_Nfp; h_surfinfo[sk + 0*p_Nfp*p_Nfaces] = sJk[f]/(2.*J); h_surfinfo[sk + 1*p_Nfp*p_Nfaces] = (idM==idP)?-1.:1.; h_surfinfo[sk + 2*p_Nfp*p_Nfaces] = nxk[f]; h_surfinfo[sk + 3*p_Nfp*p_Nfaces] = nyk[f]; h_surfinfo[sk + 4*p_Nfp*p_Nfaces] = nzk[f]; } } } c_mapinfo = device.malloc(sz2, h_mapinfo); c_surfinfo = device.malloc(sz5, h_surfinfo); free(h_mapinfo); free(h_surfinfo); printf("mesh->parNtotalout=%d\n", mesh->parNtotalout); sz = mesh->parNtotalout*sizeof(int); c_parmapOUT = device.malloc(sz+1, mesh->parmapOUT); /* now build kernels */ occa::kernelInfo dgInfo; dgInfo.addDefine("p_Np", p_Np); dgInfo.addDefine("p_Nfp", p_Nfp); dgInfo.addDefine("p_Nfaces", p_Nfaces); dgInfo.addDefine("p_Nfields", p_Nfields); dgInfo.addDefine("BSIZE", BSIZE); dgInfo.addDefine("p_max_NfpNfaces_Np", max(p_Nfp*p_Nfaces, p_Np)); volumeKernel = device.buildKernelFromSource("src/MaxwellsVolumeKernel3D.okl", "MaxwellsVolumeKernel3D", dgInfo); surfaceKernel = device.buildKernelFromSource("src/MaxwellsSurfaceKernel3D.okl", "MaxwellsSurfaceKernel3D", 
dgInfo); rkKernel = device.buildKernelFromSource("src/MaxwellsRKKernel3D.okl", "MaxwellsRKKernel3D", dgInfo); partialGetKernel = device.buildKernelFromSource("src/MaxwellsPartialGetKernel3D.okl", "MaxwellsPartialGetKernel3D", dgInfo); #if 0 diagnose_array<float>("c_DrDsDt", c_DrDsDt, 4*BSIZE*BSIZE); diagnose_array<float>("c_LIFT", c_LIFT, p_Nfaces*p_Nfp*p_Np); diagnose_array<float>("c_vgeo", c_vgeo, mesh->K*12); diagnose_array<float>("c_surfinfo", c_surfinfo, p_Nfaces*p_Nfp*7*mesh->K); diagnose_array<int> ("c_parmapOUT", c_parmapOUT, mesh->parNtotalout); #endif return dt; }