int main(int argc, char** argv) { std::vector<int> heapVector; std::vector<std::string> commandVector; BuildVector(commandVector, std::cin); heapVector.reserve(500); MinHeap<int> heap(heapVector); std::ofstream file; file.open("output.txt"); std::vector<std::string>::iterator iter; for (iter = commandVector.begin(); iter != commandVector.end(); ++iter) { // split on the space to get the operation and command auto command = SplitString(*iter, ' '); auto operation = (*command.begin())[0]; RunOperation(operation, command, heap, std::cout); } file.close(); return 0; }
/*
 * LoadBalance3d - repartition the elements of a distributed mesh with
 * ParMETIS_V3_PartMeshKway and migrate every element (its vertex ids and its
 * vertex coordinates) to the rank it was assigned to.
 *
 * Input : mesh->EToV, mesh->GX/GY/GZ (Klocal x Nverts), mesh->K, mesh->Nverts,
 *         mesh->nprocs, mesh->procid.
 * Output: mesh->EToV, mesh->GX/GY/GZ and mesh->K are replaced by the arrays
 *         holding the elements received by this rank. The previous arrays are
 *         destroyed when mesh->GX was non-NULL.
 *
 * The call is collective over MPI_COMM_WORLD.
 *
 * NOTE: the receive/send code transfers Nverts*count values starting at row
 * pointer [cnt], i.e. it relies on BuildMatrix/BuildIntMatrix handing out
 * row-contiguous storage.
 */
void LoadBalance3d(Mesh *mesh){

  int n,p,k,v;

  int nprocs = mesh->nprocs;
  int procid = mesh->procid;
  int **EToV = mesh->EToV;
  double **VX = mesh->GX;
  double **VY = mesh->GY;
  double **VZ = mesh->GZ;

  if(!procid) printf("Root: Entering LoadBalance\n");

  int Nverts = mesh->Nverts;

  int *Kprocs = BuildIntVector(nprocs);

  /* local number of elements */
  int Klocal = mesh->K;

  /* find number of elements on all processors */
  MPI_Allgather(&Klocal, 1, MPI_INT, Kprocs, 1, MPI_INT, MPI_COMM_WORLD);

  /* element distribution -- cumulative element count on processes */
  idxtype *elmdist = idxmalloc(nprocs+1, "elmdist");
  elmdist[0] = 0;
  for(p=0;p<nprocs;++p)
    elmdist[p+1] = elmdist[p] + Kprocs[p];

  /* list of element starts */
  idxtype *eptr = idxmalloc(Klocal+1, "eptr");
  eptr[0] = 0;
  for(k=0;k<Klocal;++k)
    eptr[k+1] = eptr[k] + Nverts;

  /* local element to vertex */
  idxtype *eind = idxmalloc(Nverts*Klocal, "eind");
  for(k=0;k<Klocal;++k)
    for(n=0;n<Nverts;++n)
      eind[k*Nverts+n] = EToV[k][n];

  /* weight per element */
  idxtype *elmwgt = idxmalloc(Klocal, "elmwgt");
  for(k=0;k<Klocal;++k)
    elmwgt[k] = 1.;

  /* weight flag */
  int wgtflag = 0;

  /* number flag (1=fortran, 0=c) */
  int numflag = 0;

  /* ncon = 1 */
  int ncon = 1;

  /* nodes on element face */
  int ncommonnodes = 3;

  /* number of partitions */
  int nparts = nprocs;

  /* tpwgts: target weight fraction for each (partition, constraint) pair.
     ParMETIS reads ncon*nparts entries, so the array is sized by that and
     not by the local element count. */
  float *tpwgts = (float*) calloc(ncon*nparts, sizeof(float));
  for(n=0;n<ncon*nparts;++n)
    tpwgts[n] = 1./(float)nparts;

  float ubvec[MAXNCON];
  for (n=0; n<ncon; ++n)
    ubvec[n] = UNBALANCE_FRACTION;

  int options[10];
  options[0] = 1;
  options[PMV3_OPTION_DBGLVL] = 7;
  options[PMV3_OPTION_SEED] = 0;

  int edgecut;

  /* part[k] receives the destination rank of local element k */
  idxtype *part = idxmalloc(Klocal, "part");

  MPI_Comm comm;
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);

  ParMETIS_V3_PartMeshKway
    (elmdist, eptr, eind, elmwgt,
     &wgtflag, &numflag, &ncon, &ncommonnodes, &nparts,
     tpwgts, ubvec, options, &edgecut, part, &comm);

  /* the duplicated communicator is only needed for the partitioner */
  MPI_Comm_free(&comm);

  int **outlist = (int**) calloc(nprocs, sizeof(int*));
  double **xoutlist = (double**) calloc(nprocs, sizeof(double*));
  double **youtlist = (double**) calloc(nprocs, sizeof(double*));
  double **zoutlist = (double**) calloc(nprocs, sizeof(double*));

  int *outK = (int*) calloc(nprocs, sizeof(int));
  int *inK = (int*) calloc(nprocs, sizeof(int));

  MPI_Request *inrequests   = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *outrequests  = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *xinrequests  = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *xoutrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *yinrequests  = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *youtrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *zinrequests  = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *zoutrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));

  /* count the elements leaving for each rank */
  for(k=0;k<Klocal;++k)
    ++outK[part[k]];

  /* get count of incoming elements from each process */
  MPI_Alltoall(outK, 1, MPI_INT, inK, 1, MPI_INT, MPI_COMM_WORLD);

  /* count totals on each process */
  int *newKprocs = BuildIntVector(nprocs);
  MPI_Allreduce(outK, newKprocs, nprocs, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  int totalinK = 0;
  for(p=0;p<nprocs;++p){
    totalinK += inK[p];
  }

  int **newEToV = BuildIntMatrix(totalinK, Nverts);
  double **newVX = BuildMatrix(totalinK, Nverts);
  double **newVY = BuildMatrix(totalinK, Nverts);
  double **newVZ = BuildMatrix(totalinK, Nverts);

  /* Placeholder targets for zero-length receives: indexing row [cnt] of the
     new arrays is out of range once cnt has reached totalinK (a trailing rank
     that sends nothing), so such receives are pointed here instead. */
  int idummy = 0;
  double xdummy = 0, ydummy = 0, zdummy = 0;

  /* post receives; rows from rank p land at row offset cnt */
  int cnt = 0;
  for(p=0;p<nprocs;++p){
    int    *ebuf = inK[p] ? newEToV[cnt] : &idummy;
    double *xbuf = inK[p] ? newVX[cnt]   : &xdummy;
    double *ybuf = inK[p] ? newVY[cnt]   : &ydummy;
    double *zbuf = inK[p] ? newVZ[cnt]   : &zdummy;

    MPI_Irecv(ebuf, Nverts*inK[p], MPI_INT,     p,  666+p, MPI_COMM_WORLD,  inrequests+p);
    MPI_Irecv(xbuf, Nverts*inK[p], MPI_DOUBLE,  p, 1666+p, MPI_COMM_WORLD, xinrequests+p);
    MPI_Irecv(ybuf, Nverts*inK[p], MPI_DOUBLE,  p, 2666+p, MPI_COMM_WORLD, yinrequests+p);
    MPI_Irecv(zbuf, Nverts*inK[p], MPI_DOUBLE,  p, 3666+p, MPI_COMM_WORLD, zinrequests+p);
    cnt = cnt + inK[p];
  }

  /* pack and send the elements bound for each rank */
  for(p=0;p<nprocs;++p){
    int sk = 0;
    outlist[p]  = BuildIntVector(Nverts*outK[p]);
    xoutlist[p] = BuildVector(Nverts*outK[p]);
    youtlist[p] = BuildVector(Nverts*outK[p]);
    zoutlist[p] = BuildVector(Nverts*outK[p]);

    for(k=0;k<Klocal;++k)
      if(part[k]==p){
        for(v=0;v<Nverts;++v){
          outlist[p][sk]  = EToV[k][v];
          xoutlist[p][sk] = VX[k][v];
          youtlist[p][sk] = VY[k][v];
          zoutlist[p][sk] = VZ[k][v];
          ++sk;
        }
      }

    /* tags carry the sender rank so they pair with the receives above */
    MPI_Isend(outlist[p],  Nverts*outK[p], MPI_INT,     p,  666+procid, MPI_COMM_WORLD,  outrequests+p);
    MPI_Isend(xoutlist[p], Nverts*outK[p], MPI_DOUBLE,  p, 1666+procid, MPI_COMM_WORLD, xoutrequests+p);
    MPI_Isend(youtlist[p], Nverts*outK[p], MPI_DOUBLE,  p, 2666+procid, MPI_COMM_WORLD, youtrequests+p);
    MPI_Isend(zoutlist[p], Nverts*outK[p], MPI_DOUBLE,  p, 3666+procid, MPI_COMM_WORLD, zoutrequests+p);
  }

  MPI_Status *instatus  = (MPI_Status*) calloc(nprocs, sizeof(MPI_Status));
  MPI_Status *outstatus = (MPI_Status*) calloc(nprocs, sizeof(MPI_Status));

  MPI_Waitall(nprocs,  inrequests, instatus);
  MPI_Waitall(nprocs, xinrequests, instatus);
  MPI_Waitall(nprocs, yinrequests, instatus);
  MPI_Waitall(nprocs, zinrequests, instatus);

  MPI_Waitall(nprocs,  outrequests, outstatus);
  MPI_Waitall(nprocs, xoutrequests, outstatus);
  MPI_Waitall(nprocs, youtrequests, outstatus);
  MPI_Waitall(nprocs, zoutrequests, outstatus);

  /* swap the migrated arrays into the mesh */
  if(mesh->GX!=NULL){
    DestroyMatrix(mesh->GX);
    DestroyMatrix(mesh->GY);
    DestroyMatrix(mesh->GZ);
    DestroyIntMatrix(mesh->EToV);
  }

  mesh->GX = newVX;
  mesh->GY = newVY;
  mesh->GZ = newVZ;
  mesh->EToV = newEToV;
  mesh->K = totalinK;

  /* release the per-rank send buffers and their pointer tables */
  for(p=0;p<nprocs;++p){
    if(outlist[p]){
      free(outlist[p]);
      free(xoutlist[p]);
      free(youtlist[p]);
      free(zoutlist[p]);
    }
  }
  free(outlist);
  free(xoutlist);
  free(youtlist);
  free(zoutlist);

  free(outK);
  free(inK);

  free(inrequests);
  free(outrequests);

  free(xinrequests);
  free(xoutrequests);
  free(yinrequests);
  free(youtrequests);
  free(zinrequests);
  free(zoutrequests);
  free(instatus);
  free(outstatus);

  /* partitioner inputs/outputs.
     NOTE(review): assumes idxmalloc hands out malloc-backed memory (as in
     METIS 4 / ParMETIS 3) so plain free() is the matching release -- confirm
     against the linked ParMETIS version. */
  free(elmdist);
  free(eptr);
  free(eind);
  free(elmwgt);
  free(part);
  free(tpwgts);

  free(Kprocs);
  free(newKprocs);
}
/*
 * InitCPU3d - build the single-precision host arrays used by the CPU solver.
 *
 * Fills, on `mesh`:
 *   f_Q, f_rhsQ, f_resQ : zeroed field storage, K*p_Np*Nfields floats each
 *   f_LIFT              : float copy of mesh->LIFT, row-major
 *   f_Dr, f_Ds, f_Dt    : float copies of mesh->Dr/Ds/Dt, row-major
 *   vgeo                : 12 floats per element; slots 0-2, 4-6, 8-10 hold
 *                         d(r,s,t)/d(x,y,z), slots 3, 7, 11 stay zero
 *   surfinfo            : 7 floats per face node:
 *                         idM, idP, sJ/(2J), Bscale(-1 or 1), nx, ny, nz
 *
 * Returns the minimum of J/sJ over all local elements and faces (starting
 * from 1e6). The same quantity is returned by InitOCCA3d.
 */
double InitCPU3d(Mesh *mesh, int Nfields){

  printf("Np = %d, BSIZE = %d\n", p_Np, BSIZE);

  int sz;
  int sk, n, m, f, k;

  /* Q */
  /* TW BLOCK */
  mesh->f_Q    = (float*) calloc(mesh->K*p_Np*Nfields, sizeof(float));
  mesh->f_rhsQ = (float*) calloc(mesh->K*p_Np*Nfields, sizeof(float));
  mesh->f_resQ = (float*) calloc(mesh->K*p_Np*Nfields, sizeof(float));

  /* float LIFT */
  sz = p_Np*(p_Nfp)*(p_Nfaces)*sizeof(float);
  mesh->f_LIFT = (float*) malloc(sz);

  sk = 0;
  for(n=0;n<p_Np;++n){
    for(m=0;m<p_Nfp*p_Nfaces;++m){
      mesh->f_LIFT[sk++] = mesh->LIFT[n][m];
    }
  }

  /* float Dr & Ds & Dt */
  sz = p_Np*p_Np*sizeof(float);
  mesh->f_Dr = (float*) malloc(sz);
  mesh->f_Ds = (float*) malloc(sz);
  mesh->f_Dt = (float*) malloc(sz);

  sk = 0;
  for(n=0;n<p_Np;++n){
    for(m=0;m<p_Np;++m){
      mesh->f_Dr[sk] = mesh->Dr[n][m];
      mesh->f_Ds[sk] = mesh->Ds[n][m];
      mesh->f_Dt[sk] = mesh->Dt[n][m];
      ++sk;
    }
  }

  /* vgeo */
  double drdx, dsdx, dtdx;
  double drdy, dsdy, dtdy;
  double drdz, dsdz, dtdz, J;

  mesh->vgeo = (float*) calloc(12*mesh->K, sizeof(float));

  for(k=0;k<mesh->K;++k){
    GeometricFactors3d(mesh, k,
                       &drdx, &dsdx, &dtdx,
                       &drdy, &dsdy, &dtdy,
                       &drdz, &dsdz, &dtdz, &J);

    mesh->vgeo[k*12+0] = drdx; mesh->vgeo[k*12+1] = drdy; mesh->vgeo[k*12+2]  = drdz;
    mesh->vgeo[k*12+4] = dsdx; mesh->vgeo[k*12+5] = dsdy; mesh->vgeo[k*12+6]  = dsdz;
    mesh->vgeo[k*12+8] = dtdx; mesh->vgeo[k*12+9] = dtdy; mesh->vgeo[k*12+10] = dtdz;
  }

  /* surfinfo (vmapM, vmapP, Fscale, Bscale, nx, ny, nz) */
  sz = mesh->K*p_Nfp*p_Nfaces*7*sizeof(float);
  mesh->surfinfo = (float*) malloc(sz);

  /* per-element scratch for face normals and surface Jacobians */
  double *nxk = BuildVector(mesh->Nfaces);
  double *nyk = BuildVector(mesh->Nfaces);
  double *nzk = BuildVector(mesh->Nfaces);
  double *sJk = BuildVector(mesh->Nfaces);

  double dt = 1e6;

  /* local-local info */
  sk = 0;
  for(k=0;k<mesh->K;++k){

    GeometricFactors3d(mesh, k,
                       &drdx, &dsdx, &dtdx,
                       &drdy, &dsdy, &dtdy,
                       &drdz, &dsdz, &dtdz, &J);

    Normals3d(mesh, k, nxk, nyk, nzk, sJk);

    for(f=0;f<mesh->Nfaces;++f){

      dt = min(dt, J/sJk[f]);

      for(m=0;m<p_Nfp;++m){
        int id  = m + f*p_Nfp + p_Nfp*p_Nfaces*k;
        int idM = mesh->vmapM[id];
        int idP = mesh->vmapP[id];

        /* split the node ids into (node-in-element, element) */
        int nM = idM%p_Np;
        int nP = idP%p_Np;
        int kM = (idM-nM)/p_Np;
        int kP = (idP-nP)/p_Np;

        /* re-express as offsets into the field-interleaved Q array */
        idM = Nfields*(nM + p_Np*kM);
        idP = Nfields*(nP + p_Np*kP);

        /* stub resolve some other way */
        if(mesh->vmapP[id]<0){
          idP = mesh->vmapP[id]; /* -ve numbers */
        }

        mesh->surfinfo[sk++] = idM;
        mesh->surfinfo[sk++] = idP;
        mesh->surfinfo[sk++] = sJk[f]/(2.*J);
        mesh->surfinfo[sk++] = (idM==idP)?-1.:1.;
        mesh->surfinfo[sk++] = nxk[f];
        mesh->surfinfo[sk++] = nyk[f];
        mesh->surfinfo[sk++] = nzk[f];
      }
    }
  }

  /* scratch buffers are no longer needed */
  free(nxk);
  free(nyk);
  free(nzk);
  free(sJk);

  /* the function is declared double: hand back the computed scale instead of
     falling off the end */
  return dt;
}
double InitOCCA3d(Mesh *mesh, int Nfields){ device.setup("mode = OpenCL, platformID = 0, deviceID = 2"); /* Q */ int sz = mesh->K*(BSIZE)*p_Nfields*sizeof(float); float *f_Q = (float*) calloc(mesh->K*BSIZE*p_Nfields, sizeof(float)); c_Q = device.malloc(sz, f_Q); c_rhsQ = device.malloc(sz, f_Q); c_resQ = device.malloc(sz, f_Q); printf("sz1= %d\n", sz); sz = mesh->parNtotalout*sizeof(float); c_tmp = device.malloc(sz+1, f_Q); // should not use f_Q c_partQ = device.malloc(sz+1, f_Q); printf("sz2= %d\n", sz); /* LIFT */ sz = p_Np*(p_Nfp)*p_Nfaces*sizeof(float); float *f_LIFT = (float*) malloc(sz); int skL = 0; for(int m=0;m<p_Nfp;++m){ for(int n=0;n<p_Np;++n){ for(int f=0;f<p_Nfaces;++f){ f_LIFT[skL++] = mesh->LIFT[0][p_Nfp*p_Nfaces*n+(f+p_Nfaces*m)]; } } } c_LIFT = device.malloc(sz, f_LIFT); /* DrDsDt */ sz = BSIZE*BSIZE*4*sizeof(float); float* h_DrDsDt = (float*) calloc(BSIZE*BSIZE*4, sizeof(float)); int sk = 0; /* note transposed arrays to avoid "bank conflicts" */ for(int n=0;n<p_Np;++n){ for(int m=0;m<p_Np;++m){ h_DrDsDt[4*(m+n*BSIZE)+0] = mesh->Dr[0][n+m*p_Np]; h_DrDsDt[4*(m+n*BSIZE)+1] = mesh->Ds[0][n+m*p_Np]; h_DrDsDt[4*(m+n*BSIZE)+2] = mesh->Dt[0][n+m*p_Np]; } } c_DrDsDt = device.malloc(sz, h_DrDsDt); free(h_DrDsDt); /* vgeo */ double drdx, dsdx, dtdx; double drdy, dsdy, dtdy; double drdz, dsdz, dtdz, J; float *vgeo = (float*) calloc(12*mesh->K, sizeof(float)); for(int k=0;k<mesh->K;++k){ GeometricFactors3d(mesh, k, &drdx, &dsdx, &dtdx, &drdy, &dsdy, &dtdy, &drdz, &dsdz, &dtdz, &J); vgeo[k*12+0] = drdx; vgeo[k*12+1] = drdy; vgeo[k*12+2] = drdz; vgeo[k*12+4] = dsdx; vgeo[k*12+5] = dsdy; vgeo[k*12+6] = dsdz; vgeo[k*12+8] = dtdx; vgeo[k*12+9] = dtdy; vgeo[k*12+10] = dtdz; } sz = mesh->K*12*sizeof(float); c_vgeo = device.malloc(sz, vgeo); /* surfinfo (vmapM, vmapP, Fscale, Bscale, nx, ny, nz, 0) */ int sz5 = mesh->K*p_Nfp*p_Nfaces*5*sizeof(float); float* h_surfinfo = (float*) malloc(sz5); int sz2 = mesh->K*p_Nfp*p_Nfaces*2*sizeof(int); int* h_mapinfo = (int*) 
malloc(sz2); /* local-local info */ sk = 0; int skP = -1; double *nxk = BuildVector(mesh->Nfaces); double *nyk = BuildVector(mesh->Nfaces); double *nzk = BuildVector(mesh->Nfaces); double *sJk = BuildVector(mesh->Nfaces); double dt = 1e6; for(int k=0;k<mesh->K;++k){ GeometricFactors3d(mesh, k, &drdx, &dsdx, &dtdx, &drdy, &dsdy, &dtdy, &drdz, &dsdz, &dtdz, &J); Normals3d(mesh, k, nxk, nyk, nzk, sJk); for(int f=0;f<mesh->Nfaces;++f){ dt = min(dt, J/sJk[f]); for(int m=0;m<p_Nfp;++m){ int n = m + f*p_Nfp + p_Nfp*p_Nfaces*k; int idM = mesh->vmapM[n]; int idP = mesh->vmapP[n]; int nM = idM%p_Np; int nP = idP%p_Np; int kM = (idM-nM)/p_Np; int kP = (idP-nP)/p_Np; idM = nM + Nfields*BSIZE*kM; idP = nP + Nfields*BSIZE*kP; /* stub resolve some other way */ if(mesh->vmapP[n]<0){ idP = mesh->vmapP[n]; /* -ve numbers */ } sk = 2*p_Nfp*p_Nfaces*k+m+f*p_Nfp; h_mapinfo[sk + 0*p_Nfp*p_Nfaces] = idM; h_mapinfo[sk + 1*p_Nfp*p_Nfaces] = idP; sk = 5*p_Nfp*p_Nfaces*k+m+f*p_Nfp; h_surfinfo[sk + 0*p_Nfp*p_Nfaces] = sJk[f]/(2.*J); h_surfinfo[sk + 1*p_Nfp*p_Nfaces] = (idM==idP)?-1.:1.; h_surfinfo[sk + 2*p_Nfp*p_Nfaces] = nxk[f]; h_surfinfo[sk + 3*p_Nfp*p_Nfaces] = nyk[f]; h_surfinfo[sk + 4*p_Nfp*p_Nfaces] = nzk[f]; } } } c_mapinfo = device.malloc(sz2, h_mapinfo); c_surfinfo = device.malloc(sz5, h_surfinfo); free(h_mapinfo); free(h_surfinfo); printf("mesh->parNtotalout=%d\n", mesh->parNtotalout); sz = mesh->parNtotalout*sizeof(int); c_parmapOUT = device.malloc(sz+1, mesh->parmapOUT); /* now build kernels */ occa::kernelInfo dgInfo; dgInfo.addDefine("p_Np", p_Np); dgInfo.addDefine("p_Nfp", p_Nfp); dgInfo.addDefine("p_Nfaces", p_Nfaces); dgInfo.addDefine("p_Nfields", p_Nfields); dgInfo.addDefine("BSIZE", BSIZE); dgInfo.addDefine("p_max_NfpNfaces_Np", max(p_Nfp*p_Nfaces, p_Np)); volumeKernel = device.buildKernelFromSource("src/MaxwellsVolumeKernel3D.okl", "MaxwellsVolumeKernel3D", dgInfo); surfaceKernel = device.buildKernelFromSource("src/MaxwellsSurfaceKernel3D.okl", "MaxwellsSurfaceKernel3D", 
dgInfo); rkKernel = device.buildKernelFromSource("src/MaxwellsRKKernel3D.okl", "MaxwellsRKKernel3D", dgInfo); partialGetKernel = device.buildKernelFromSource("src/MaxwellsPartialGetKernel3D.okl", "MaxwellsPartialGetKernel3D", dgInfo); #if 0 diagnose_array<float>("c_DrDsDt", c_DrDsDt, 4*BSIZE*BSIZE); diagnose_array<float>("c_LIFT", c_LIFT, p_Nfaces*p_Nfp*p_Np); diagnose_array<float>("c_vgeo", c_vgeo, mesh->K*12); diagnose_array<float>("c_surfinfo", c_surfinfo, p_Nfaces*p_Nfp*7*mesh->K); diagnose_array<int> ("c_parmapOUT", c_parmapOUT, mesh->parNtotalout); #endif return dt; }
/*
 * BuildMaps2d - build the face-node connectivity maps of a 2D mesh.
 *
 * Fills, on `mesh`:
 *   vmapM        : for every face node, its own volume node id (k*p_Np + n)
 *   vmapP        : the matching node id on the neighbouring element; for
 *                  boundary faces it equals vmapM, for faces whose neighbour
 *                  lives on another rank it is a negative counter (-1, -2, ...)
 *                  in the order the incoming data will arrive
 *   parNtotalout : number of values sent to other ranks per exchange
 *   parmapOUT    : indices into Q of the values to send, in the order the
 *                  receiving rank expects them
 *   f_outQ,f_inQ : zeroed communication buffers (parNtotalout+1 floats)
 *
 * Nodes are paired geometrically: two nodes match when their squared distance,
 * normalised by the squared surface Jacobian of the face, is below NODETOL.
 *
 * Uses point-to-point MPI on MPI_COMM_WORLD with every rank listed in
 * mesh->Npar. All temporary buffers are released before returning.
 */
void BuildMaps2d(Mesh *mesh){

  int nprocs = mesh->nprocs;
  int procid = mesh->procid;

  int K      = mesh->K;
  int Nfaces = mesh->Nfaces;

  mesh->vmapM = BuildIntVector(p_Nfp*p_Nfaces*K);
  mesh->vmapP = BuildIntVector(p_Nfp*p_Nfaces*K);

  int m;
  int k1,f1,n1,id1, k2,f2,p2,n2;

  double x1, y1, x2, y2, d12;

  double *nxk = BuildVector(Nfaces);
  double *nyk = BuildVector(Nfaces);
  double *sJk = BuildVector(Nfaces);

  /* first build local */
  for(k1=0;k1<K;++k1){

    /* get some information about the face geometries */
    Normals2d(mesh, k1, nxk, nyk, sJk);

    for(f1=0;f1<Nfaces;++f1){

      /* volume -> face nodes */
      for(n1=0;n1<p_Nfp;++n1){
        id1 = n1+f1*p_Nfp+k1*p_Nfp*p_Nfaces;
        mesh->vmapM[id1] = mesh->Fmask[f1][n1] + k1*p_Np;
      }

      /* find neighbor */
      k2 = mesh->EToE[k1][f1];
      f2 = mesh->EToF[k1][f1];
      p2 = mesh->EToP[k1][f1];

      if(k1==k2 || procid!=p2 ){
        /* treat as boundary for the moment */
        for(n1=0;n1<p_Nfp;++n1){
          id1 = n1+f1*p_Nfp+k1*p_Nfp*p_Nfaces;
          mesh->vmapP[id1] = k1*p_Np + mesh->Fmask[f1][n1];
        }
      }else{
        /* local neighbour: pair up nodes by position */
        for(n1=0;n1<p_Nfp;++n1){
          id1 = n1+f1*p_Nfp+k1*p_Nfp*p_Nfaces;
          x1 = mesh->x[k1][mesh->Fmask[f1][n1]];
          y1 = mesh->y[k1][mesh->Fmask[f1][n1]];

          for(n2=0;n2<p_Nfp;++n2){
            x2 = mesh->x[k2][mesh->Fmask[f2][n2]];
            y2 = mesh->y[k2][mesh->Fmask[f2][n2]];

            /* find normalized distance between these nodes */
            /* [ use sJk as a measure of edge length (ignore factor of 2) ] */
            d12 = ((x1-x2)*(x1-x2) +
                   (y1-y2)*(y1-y2))/(sJk[f1]*sJk[f1]);
            if(d12<NODETOL){
              mesh->vmapP[id1] = k2*p_Np + mesh->Fmask[f2][n2];
            }
          }
        }
      }
    }
  }

#if 0
  int n;
  for(n=0;n<p_Nfp*p_Nfaces*mesh->K;++n){
    x1 = mesh->x[0][mesh->vmapM[n]];
    y1 = mesh->y[0][mesh->vmapM[n]];
    x2 = mesh->x[0][mesh->vmapP[n]];
    y2 = mesh->y[0][mesh->vmapP[n]];
    d12 = ((x1-x2)*(x1-x2) +
           (y1-y2)*(y1-y2));
    printf("n:%d %d -> %d d=%lg\n", n, mesh->vmapM[n], mesh->vmapP[n], d12);
  }
#endif

  /* now build parallel maps */
  double **xsend = (double**) calloc(nprocs, sizeof(double*));
  double **ysend = (double**) calloc(nprocs, sizeof(double*));
  double **xrecv = (double**) calloc(nprocs, sizeof(double*));
  double **yrecv = (double**) calloc(nprocs, sizeof(double*));

  int **Esend = (int**) calloc(nprocs, sizeof(int*));
  int **Fsend = (int**) calloc(nprocs, sizeof(int*));
  int **Erecv = (int**) calloc(nprocs, sizeof(int*));
  int **Frecv = (int**) calloc(nprocs, sizeof(int*));

  /* buffers exist only for ranks we share faces with; others stay NULL */
  for(p2=0;p2<nprocs;++p2){
    if(mesh->Npar[p2]){
      xsend[p2] = BuildVector(mesh->Npar[p2]*p_Nfp);
      ysend[p2] = BuildVector(mesh->Npar[p2]*p_Nfp);
      Esend[p2] = BuildIntVector(mesh->Npar[p2]*p_Nfp);
      Fsend[p2] = BuildIntVector(mesh->Npar[p2]*p_Nfp);

      xrecv[p2] = BuildVector(mesh->Npar[p2]*p_Nfp);
      yrecv[p2] = BuildVector(mesh->Npar[p2]*p_Nfp);
      Erecv[p2] = BuildIntVector(mesh->Npar[p2]*p_Nfp);
      Frecv[p2] = BuildIntVector(mesh->Npar[p2]*p_Nfp);
    }
  }

  /* skP[p] counts the face nodes packed for rank p */
  int *skP = BuildIntVector(nprocs);

  /* send coordinates in local order */
  for(k1=0;k1<K;++k1){
    for(f1=0;f1<p_Nfaces;++f1){
      p2 = mesh->EToP[k1][f1];
      if(p2!=procid){
        for(n1=0;n1<p_Nfp;++n1){
          xsend[p2][skP[p2]] = mesh->x[k1][mesh->Fmask[f1][n1]];
          ysend[p2][skP[p2]] = mesh->y[k1][mesh->Fmask[f1][n1]];
          Esend[p2][skP[p2]] = mesh->EToE[k1][f1];
          Fsend[p2][skP[p2]] = mesh->EToF[k1][f1];
          ++(skP[p2]);
        }
      }
    }
  }

  MPI_Request *xsendrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *ysendrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *xrecvrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *yrecvrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *Esendrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *Fsendrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *Erecvrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));
  MPI_Request *Frecvrequests = (MPI_Request*) calloc(nprocs, sizeof(MPI_Request));

  MPI_Status  *status = (MPI_Status*) calloc(nprocs, sizeof(MPI_Status));

  /* exchange with every neighbouring rank; tags carry the receiver rank */
  int cnt = 0;
  for(p2=0;p2<nprocs;++p2){
    if(p2!=procid && mesh->Npar[p2]!=0){
      int Nout = mesh->Npar[p2]*p_Nfp;

      MPI_Isend(xsend[p2], Nout, MPI_DOUBLE, p2,  666+p2, MPI_COMM_WORLD, xsendrequests+cnt);
      MPI_Isend(ysend[p2], Nout, MPI_DOUBLE, p2, 1666+p2, MPI_COMM_WORLD, ysendrequests+cnt);
      MPI_Isend(Esend[p2], Nout, MPI_INT,    p2, 2666+p2, MPI_COMM_WORLD, Esendrequests+cnt);
      MPI_Isend(Fsend[p2], Nout, MPI_INT,    p2, 3666+p2, MPI_COMM_WORLD, Fsendrequests+cnt);

      MPI_Irecv(xrecv[p2], Nout, MPI_DOUBLE, p2,  666+procid, MPI_COMM_WORLD, xrecvrequests+cnt);
      MPI_Irecv(yrecv[p2], Nout, MPI_DOUBLE, p2, 1666+procid, MPI_COMM_WORLD, yrecvrequests+cnt);
      MPI_Irecv(Erecv[p2], Nout, MPI_INT,    p2, 2666+procid, MPI_COMM_WORLD, Erecvrequests+cnt);
      MPI_Irecv(Frecv[p2], Nout, MPI_INT,    p2, 3666+procid, MPI_COMM_WORLD, Frecvrequests+cnt);
      ++cnt;
    }
  }

  MPI_Waitall(cnt, xsendrequests, status);
  MPI_Waitall(cnt, ysendrequests, status);
  MPI_Waitall(cnt, Esendrequests, status);
  MPI_Waitall(cnt, Fsendrequests, status);
  MPI_Waitall(cnt, xrecvrequests, status);
  MPI_Waitall(cnt, yrecvrequests, status);
  MPI_Waitall(cnt, Erecvrequests, status);
  MPI_Waitall(cnt, Frecvrequests, status);

  /* add up the total number of outgoing/ingoing nodes */
  mesh->parNtotalout = 0;
  for(p2=0;p2<nprocs;++p2)
    mesh->parNtotalout += skP[p2]*p_Nfields;

  mesh->parmapOUT = BuildIntVector(mesh->parNtotalout);

  /* now match up local nodes with the requested (recv'ed nodes) */
  int sk = 0;
  for(p2=0;p2<nprocs;++p2){
    /* for each received face node */
    for(m=0;m<skP[p2];++m){
      k1 = Erecv[p2][m]; /* adjust for the local element index */
      f1 = Frecv[p2][m];
      x2 = xrecv[p2][m];
      y2 = yrecv[p2][m];

      Normals2d(mesh, k1, nxk, nyk, sJk);

      for(n1=0;n1<p_Nfp;++n1){
        x1 = mesh->x[k1][mesh->Fmask[f1][n1]];
        y1 = mesh->y[k1][mesh->Fmask[f1][n1]];

        d12 = ((x1-x2)*(x1-x2) +
               (y1-y2)*(y1-y2))/(sJk[f1]*sJk[f1]);

        if(d12<NODETOL){
          int fld;
          for(fld=0;fld<p_Nfields;++fld){
#ifdef CUDA
            mesh->parmapOUT[sk++] = k1*BSIZE*p_Nfields+mesh->Fmask[f1][n1] + BSIZE*fld;
#else
            mesh->parmapOUT[sk++] = p_Nfields*(k1*p_Np+mesh->Fmask[f1][n1]) + fld;
#endif
          }
        }
      }
    }
  }

  /* create incoming node map */
  int parcnt = -1;
  for(p2=0;p2<nprocs;++p2){
    for(k1=0;k1<K;++k1){
      for(f1=0;f1<p_Nfaces;++f1){
        if(mesh->EToP[k1][f1]==p2 && p2!=procid){
          for(n1=0;n1<p_Nfp;++n1){
            id1 = n1+f1*p_Nfp+k1*p_Nfp*p_Nfaces;
            mesh->vmapP[id1] = parcnt;
            --parcnt;
          }
        }
      }
    }
  }

  /* buffers for communication */
  mesh->f_outQ = (float*) calloc(mesh->parNtotalout+1, sizeof(float));
  mesh->f_inQ  = (float*) calloc(mesh->parNtotalout+1, sizeof(float));

  /* release everything that was only needed to build the maps;
     entries for ranks without shared faces are NULL, which free() accepts */
  for(p2=0;p2<nprocs;++p2){
    free(xsend[p2]); free(ysend[p2]);
    free(xrecv[p2]); free(yrecv[p2]);
    free(Esend[p2]); free(Fsend[p2]);
    free(Erecv[p2]); free(Frecv[p2]);
  }
  free(xsend); free(ysend);
  free(xrecv); free(yrecv);
  free(Esend); free(Fsend);
  free(Erecv); free(Frecv);

  free(xsendrequests); free(ysendrequests);
  free(xrecvrequests); free(yrecvrequests);
  free(Esendrequests); free(Fsendrequests);
  free(Erecvrequests); free(Frecvrequests);
  free(status);

  free(skP);
  free(nxk);
  free(nyk);
  free(sJk);
}
Mesh *ReadMesh3d(char *filename){ int n; Mesh *mesh = (Mesh*) calloc(1, sizeof(Mesh)); char buf[BUFSIZ]; FILE *fp = fopen(filename, "r"); /* assume modified Gambit neutral format */ for(n=0;n<6;++n) fgets(buf, BUFSIZ, fp); fgets(buf, BUFSIZ, fp); sscanf(buf, "%d %d \n", &(mesh->Nv), &(mesh->K)); mesh->Nverts = 4; /* assume tets */ mesh->Nedges = 6; /* assume tets */ mesh->Nfaces = 4; /* assume tets */ fgets(buf, BUFSIZ, fp); fgets(buf, BUFSIZ, fp); /* read vertex coordinates */ double *VX = BuildVector(mesh->Nv); double *VY = BuildVector(mesh->Nv); double *VZ = BuildVector(mesh->Nv); for(n=0;n<mesh->Nv;++n){ fgets(buf, BUFSIZ, fp); sscanf(buf, "%*d %lf %lf %lf", VX+n, VY+n, VZ+n); } /* decide on parition */ int procid, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &procid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); mesh->procid = procid; mesh->nprocs = nprocs; /* assume this proc owns a block of elements */ int Klocal, Kstart; int *Kprocs = (int*) calloc(nprocs, sizeof(int)); int p; int **newEToV, *newKprocs; double **newVX, **newVY; Klocal = (int) ( (double)(mesh->K)/(double)nprocs ); for(p=0;p<nprocs-1;++p){ Kprocs[p] = Klocal; } Kprocs[p] = Klocal + mesh->K - nprocs*Klocal; Kstart= 0; for(p=0;p<procid;++p) Kstart += Kprocs[p]; Klocal = Kprocs[procid]; /* read element to vertex connectivity */ fgets(buf, BUFSIZ, fp); fgets(buf, BUFSIZ, fp); mesh->EToV = BuildIntMatrix(Klocal, mesh->Nverts); mesh->GX = BuildMatrix(Klocal, mesh->Nverts); mesh->GY = BuildMatrix(Klocal, mesh->Nverts); mesh->GZ = BuildMatrix(Klocal, mesh->Nverts); int sk = 0, v; for(n=0;n<mesh->K;++n){ fgets(buf, BUFSIZ, fp); if(n>=Kstart && n<Kstart+Klocal){ sscanf(buf, "%*d %*d %*d %d %d %d %d", mesh->EToV[sk]+0, mesh->EToV[sk]+1, mesh->EToV[sk]+2, mesh->EToV[sk]+3); /* correct to 0-index */ --(mesh->EToV[sk][0]); --(mesh->EToV[sk][1]); --(mesh->EToV[sk][2]); --(mesh->EToV[sk][3]); for(v=0;v<mesh->Nverts;++v){ mesh->GX[sk][v] = VX[mesh->EToV[sk][v]]; mesh->GY[sk][v] = VY[mesh->EToV[sk][v]]; mesh->GZ[sk][v] = 
VZ[mesh->EToV[sk][v]]; } ++sk; } } fgets(buf, BUFSIZ, fp); fgets(buf, BUFSIZ, fp); mesh->K = Klocal; fclose(fp); return mesh; }
/* TargetIntrinsicLower - To handle builtins, we want to expand the
 * invocation into normal LLVM code.  If the target can handle the builtin, this
 * function should emit the expanded code and return true.
 *
 * This implementation handles the AltiVec builtins listed in the switch below.
 *
 *   exp         - the call expression; used here only for error locations.
 *   FnCode      - builtin function code selecting the case.
 *   DestLoc     - not referenced by any case in this function.
 *   Result      - out: receives the value computed for the builtin.
 *   ResultType  - LLVM type expected for the result.
 *   Ops         - already-lowered operands; some cases overwrite entries.
 *   Args        - original argument trees; only consulted through
 *                 IntrinsicOpIsSigned.
 *   CurBB       - basic block new instructions are appended to.
 *   ResIsSigned, ExpIsSigned - signedness flags forwarded to the cast helpers.
 *
 * Returns true when FnCode was handled (including the cases that report an
 * error through error() and substitute a placeholder Result), false when the
 * code is not one of the builtins below.
 */
bool TreeToLLVM::TargetIntrinsicLower(tree exp,
                                      unsigned FnCode,
                                      Value *DestLoc,
                                      Value *&Result,
                                      const Type *ResultType,
                                      std::vector<Value*> &Ops,
                                      SmallVector<tree, 8> &Args,
                                      BasicBlock *CurBB,
                                      bool ResIsSigned,
                                      bool ExpIsSigned) {
  switch (FnCode) {
  default: break;

  /* Element-wise arithmetic and bitwise builtins map onto a single LLVM
   * binary operator (plus a 'not' for the complemented forms). */
  case ALTIVEC_BUILTIN_VADDFP:
  case ALTIVEC_BUILTIN_VADDUBM:
  case ALTIVEC_BUILTIN_VADDUHM:
  case ALTIVEC_BUILTIN_VADDUWM:
    Result = BinaryOperator::createAdd(Ops[0], Ops[1], "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VSUBFP:
  case ALTIVEC_BUILTIN_VSUBUBM:
  case ALTIVEC_BUILTIN_VSUBUHM:
  case ALTIVEC_BUILTIN_VSUBUWM:
    Result = BinaryOperator::createSub(Ops[0], Ops[1], "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VAND:
    Result = BinaryOperator::createAnd(Ops[0], Ops[1], "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VANDC:
    /* and-with-complement: Ops[0] & ~Ops[1] */
    Ops[1] = BinaryOperator::createNot(Ops[1], "tmp", CurBB);
    Result = BinaryOperator::createAnd(Ops[0], Ops[1], "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VOR:
    Result = BinaryOperator::createOr(Ops[0], Ops[1], "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VNOR:
    /* nor: ~(Ops[0] | Ops[1]) */
    Result = BinaryOperator::createOr(Ops[0], Ops[1], "tmp", CurBB);
    Result = BinaryOperator::createNot(Result, "tmp", CurBB);
    return true;
  case ALTIVEC_BUILTIN_VXOR:
    Result = BinaryOperator::createXor(Ops[0], Ops[1], "tmp", CurBB);
    return true;

  /* Load/store builtins are forwarded to the named llvm.ppc.altivec.*
   * intrinsic through MergeIntPtrOperand.  Each case keeps its own
   * function-static Cache, which therefore persists across calls.  The
   * integer argument is 0 for the lv* forms and 1 for the stv* forms --
   * presumably the index of the first address operand; confirm against
   * MergeIntPtrOperand. */
  case ALTIVEC_BUILTIN_LVSL: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvsl",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVSR: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvsr",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVXL: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvxl",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVEBX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvebx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVEHX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvehx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_LVEWX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 0, "llvm.ppc.altivec.lvewx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_STVX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 1, "llvm.ppc.altivec.stvx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_STVEBX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 1, "llvm.ppc.altivec.stvebx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_STVEHX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 1, "llvm.ppc.altivec.stvehx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_STVEWX: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 1, "llvm.ppc.altivec.stvewx",
                         ResultType, Ops, CurBB, Result);
    }
    return true;
  case ALTIVEC_BUILTIN_STVXL: {
      static Constant *Cache = NULL;
      MergeIntPtrOperand(this, Cache, 1, "llvm.ppc.altivec.stvxl",
                         ResultType, Ops, CurBB, Result);
    }
    return true;

  /* Splat-immediate: the operand must be a ConstantInt; it is sign-extended
   * or truncated to the element type and replicated into a constant vector.
   * Otherwise an error is reported and Result becomes an undef vector. */
  case ALTIVEC_BUILTIN_VSPLTISB:
    if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) {
      Elt = ConstantExpr::getIntegerCast(Elt, Type::Int8Ty, true);
      Result = BuildVector(Elt, Elt, Elt, Elt,  Elt, Elt, Elt, Elt,
                           Elt, Elt, Elt, Elt,  Elt, Elt, Elt, Elt, NULL);
    } else {
      error("%Helement must be an immediate", &EXPR_LOCATION(exp));
      Result = UndefValue::get(VectorType::get(Type::Int8Ty, 16));
    }
    return true;
  case ALTIVEC_BUILTIN_VSPLTISH:
    if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) {
      Elt = ConstantExpr::getIntegerCast(Elt, Type::Int16Ty, true);
      Result = BuildVector(Elt, Elt, Elt, Elt,  Elt, Elt, Elt, Elt, NULL);
    } else {
      error("%Helement must be an immediate", &EXPR_LOCATION(exp));
      Result = UndefValue::get(VectorType::get(Type::Int16Ty, 8));
    }
    return true;
  case ALTIVEC_BUILTIN_VSPLTISW:
    if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) {
      Elt = ConstantExpr::getIntegerCast(Elt, Type::Int32Ty, true);
      Result = BuildVector(Elt, Elt, Elt, Elt, NULL);
    } else {
      /* NOTE(review): the two sibling cases say "element", this one says
       * "mask" -- possibly unintentional. */
      error("%Hmask must be an immediate", &EXPR_LOCATION(exp));
      Result = UndefValue::get(VectorType::get(Type::Int32Ty, 4));
    }
    return true;

  /* Splat-element: Ops[1] must be a ConstantInt element number; the result is
   * a shuffle that repeats that element of Ops[0].  On error Ops[0] is
   * returned unchanged. */
  case ALTIVEC_BUILTIN_VSPLTB:
    if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) {
      int EV = Elt->getZExtValue();
      Result = BuildVectorShuffle(Ops[0], Ops[0],
                                  EV, EV, EV, EV, EV, EV, EV, EV,
                                  EV, EV, EV, EV, EV, EV, EV, EV);
    } else {
      error("%Helement number must be an immediate", &EXPR_LOCATION(exp));
      Result = Ops[0];
    }
    return true;
  case ALTIVEC_BUILTIN_VSPLTH:
    if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) {
      int EV = Elt->getZExtValue();
      Result = BuildVectorShuffle(Ops[0], Ops[0],
                                  EV, EV, EV, EV, EV, EV, EV, EV);
    } else {
      error("%Helement number must be an immediate", &EXPR_LOCATION(exp));
      Result = Ops[0];
    }
    return true;
  case ALTIVEC_BUILTIN_VSPLTW:
    if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) {
      int EV = Elt->getZExtValue();
      Result = BuildVectorShuffle(Ops[0], Ops[0], EV, EV, EV, EV);
    } else {
      error("%Helement number must be an immediate", &EXPR_LOCATION(exp));
      Result = Ops[0];
    }
    return true;

  case ALTIVEC_BUILTIN_VSLDOI_16QI:
  case ALTIVEC_BUILTIN_VSLDOI_8HI:
  case ALTIVEC_BUILTIN_VSLDOI_4SI:
  case ALTIVEC_BUILTIN_VSLDOI_4SF:
    if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[2])) {
      /* Map all of these to a shuffle. */
      /* Only the low 4 bits of the shift amount are used.  Both inputs are
       * cast to <16 x i8> and bytes Amt..Amt+15 of their concatenation are
       * selected. */
      unsigned Amt = Elt->getZExtValue() & 15;
      VectorType *v16i8 = VectorType::get(Type::Int8Ty, 16);
      Value *Op0 = Ops[0];
      Instruction::CastOps opc = CastInst::getCastOpcode(Op0,
        IntrinsicOpIsSigned(Args,0), ResultType, false);
      Ops[0] = CastToType(opc, Op0, v16i8);
      Value *Op1 = Ops[1];
      opc = CastInst::getCastOpcode(Op1,
        IntrinsicOpIsSigned(Args,1), ResultType, false);
      Ops[1] = CastToType(opc, Op1, v16i8);
      Result = BuildVectorShuffle(Ops[0], Ops[1],
                                  Amt, Amt+1, Amt+2, Amt+3, Amt+4, Amt+5,
                                  Amt+6, Amt+7, Amt+8, Amt+9, Amt+10, Amt+11,
                                  Amt+12, Amt+13, Amt+14, Amt+15);
    } else {
      error("%Hshift amount must be an immediate", &EXPR_LOCATION(exp));
      Result = Ops[0];
    }
    return true;

  /* Pack (modulo): both inputs are cast to the result type, then the
   * odd-numbered elements of their concatenation are selected. */
  case ALTIVEC_BUILTIN_VPKUHUM: {
    Value *Op0 = Ops[0];
    Instruction::CastOps opc = CastInst::getCastOpcode(Op0,
      IntrinsicOpIsSigned(Args,0), ResultType, ExpIsSigned);
    Ops[0] = CastInst::create(opc, Op0, ResultType, Op0->getName(), CurBB);
    Value *Op1 = Ops[1];
    opc = CastInst::getCastOpcode(Op1,
      IntrinsicOpIsSigned(Args,1), ResultType, ExpIsSigned);
    Ops[1] = CastInst::create(opc, Op1, ResultType, Op1->getName(), CurBB);
    Result = BuildVectorShuffle(Ops[0], Ops[1], 1, 3, 5, 7, 9, 11, 13, 15,
                                17, 19, 21, 23, 25, 27, 29, 31);
    return true;
  }
  case ALTIVEC_BUILTIN_VPKUWUM: {
    Value *Op0 = Ops[0];
    Instruction::CastOps opc = CastInst::getCastOpcode(Op0,
      IntrinsicOpIsSigned(Args,0), ResultType, ExpIsSigned);
    Ops[0] = CastInst::create(opc, Op0, ResultType, Op0->getName(), CurBB);
    Value *Op1 = Ops[1];
    opc = CastInst::getCastOpcode(Op1,
      IntrinsicOpIsSigned(Args,1), ResultType, ExpIsSigned);
    Ops[1] = CastInst::create(opc, Op1, ResultType, Op1->getName(), CurBB);
    Result = BuildVectorShuffle(Ops[0], Ops[1], 1, 3, 5, 7, 9, 11, 13, 15);
    return true;
  }

  /* Merge high/low: interleave elements from the first (high) or second
   * (low) half of the two inputs; indices >= the element count refer to
   * Ops[1]. */
  case ALTIVEC_BUILTIN_VMRGHB:
    Result = BuildVectorShuffle(Ops[0], Ops[1],
                                0, 16, 1, 17, 2, 18, 3, 19,
                                4, 20, 5, 21, 6, 22, 7, 23);
    return true;
  case ALTIVEC_BUILTIN_VMRGHH:
    Result = BuildVectorShuffle(Ops[0], Ops[1], 0, 8, 1, 9, 2, 10, 3, 11);
    return true;
  case ALTIVEC_BUILTIN_VMRGHW:
    Result = BuildVectorShuffle(Ops[0], Ops[1], 0, 4, 1, 5);
    return true;
  case ALTIVEC_BUILTIN_VMRGLB:
    Result = BuildVectorShuffle(Ops[0], Ops[1],
                                 8, 24,  9, 25, 10, 26, 11, 27,
                                12, 28, 13, 29, 14, 30, 15, 31);
    return true;
  case ALTIVEC_BUILTIN_VMRGLH:
    Result = BuildVectorShuffle(Ops[0], Ops[1], 4, 12, 5, 13, 6, 14, 7, 15);
    return true;
  case ALTIVEC_BUILTIN_VMRGLW:
    Result = BuildVectorShuffle(Ops[0], Ops[1], 2, 6, 3, 7);
    return true;

  case ALTIVEC_BUILTIN_ABS_V4SF: {
    /* and out sign bits */
    /* The operand is bitcast to <4 x i32>, masked with 0x7FFFFFFF per lane,
     * and the result is converted back via TargetIntrinsicCastResult. */
    VectorType *v4i32 = VectorType::get(Type::Int32Ty, 4);
    Ops[0] = new BitCastInst(Ops[0], v4i32, Ops[0]->getName(),CurBB);
    Constant *C = ConstantInt::get(Type::Int32Ty, 0x7FFFFFFF);
    C = ConstantVector::get(std::vector<Constant*>(4, C));
    Result = BinaryOperator::createAnd(Ops[0], C, "tmp", CurBB);
    TargetIntrinsicCastResult(Result, ResultType,
                              ResIsSigned, ExpIsSigned, CurBB);
    return true;
  }
  case ALTIVEC_BUILTIN_ABS_V4SI:
  case ALTIVEC_BUILTIN_ABS_V8HI:
  case ALTIVEC_BUILTIN_ABS_V16QI: {
    /* iabs(x) -> smax(x, 0-x) */
    Result = BinaryOperator::createNeg(Ops[0], "tmp", CurBB);
    /* get the right smax intrinsic. */
    /* Function-static cache of the vmaxs* declaration, one slot per element
     * type.  NOTE(review): assumes GetAltivecTypeNumFromType returns 0..2 --
     * confirm. */
    static Constant *smax[3];
    const VectorType *PTy = cast<VectorType>(ResultType);
    unsigned N = GetAltivecTypeNumFromType(PTy->getElementType());
    if (smax[N] == 0) {
      Module *M = CurBB->getParent()->getParent();
      smax[N] = M->getOrInsertFunction(std::string("llvm.ppc.altivec.vmaxs")+
                                       GetAltivecLetterFromType(PTy->getElementType()),
                                       ResultType, ResultType, ResultType, NULL);
    }
    Result = new CallInst(smax[N], Ops[0], Result, "tmp", CurBB);
    return true;
  }
  case ALTIVEC_BUILTIN_ABSS_V4SI:
  case ALTIVEC_BUILTIN_ABSS_V8HI:
  case ALTIVEC_BUILTIN_ABSS_V16QI: {
    /* iabss(x) -> smax(x, satsub(0,x)) */
    /* Function-static caches, one slot per element type (same indexing
     * assumption as the non-saturating case above). */
    static Constant *sxs[3], *smax[3];
    /* get the right satsub intrinsic. */
    const VectorType *PTy = cast<VectorType>(ResultType);
    unsigned N = GetAltivecTypeNumFromType(PTy->getElementType());
    if (sxs[N] == 0) {
      Module *M = CurBB->getParent()->getParent();
      sxs[N] = M->getOrInsertFunction(std::string("llvm.ppc.altivec.vsubs")+
                                      GetAltivecLetterFromType(PTy->getElementType())+"s",
                                      ResultType, ResultType, ResultType, NULL);
    }
    /* Result = satsub(0, x) */
    Result = Constant::getNullValue(ResultType);
    Result = new CallInst(sxs[N], Result, Ops[0], "tmp", CurBB);
    /* get the right smax intrinsic. */
    if (smax[N] == 0) {
      Module *M = CurBB->getParent()->getParent();
      smax[N] = M->getOrInsertFunction(std::string("llvm.ppc.altivec.vmaxs")+
                                       GetAltivecLetterFromType(PTy->getElementType()),
                                       ResultType, ResultType, ResultType, NULL);
    }
    Result = new CallInst(smax[N], Ops[0], Result, "tmp", CurBB);
    return true;
  }
  }

  /* FnCode is not a builtin this target lowers here. */
  return false;
}
// TargetIntrinsicLower - To handle builtins, we want to expand the //invocation into normal LLVM code. If the target can handle the builtin, this //function should emit the expanded code and return true. // bool TreeToLLVM::TargetIntrinsicLower(tree exp, unsigned FnCode, const MemRef *DestLoc, Value *&Result, const Type *ResultType, std::vector<Value*> &Ops) { switch (FnCode) { default: break; case ALTIVEC_BUILTIN_VADDFP: case ALTIVEC_BUILTIN_VADDUBM: case ALTIVEC_BUILTIN_VADDUHM: case ALTIVEC_BUILTIN_VADDUWM: Result = Builder.CreateAdd(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_VSUBFP: case ALTIVEC_BUILTIN_VSUBUBM: case ALTIVEC_BUILTIN_VSUBUHM: case ALTIVEC_BUILTIN_VSUBUWM: Result = Builder.CreateSub(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_VAND: Result = Builder.CreateAnd(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_VANDC: Ops[1] = Builder.CreateNot(Ops[1], "tmp"); Result = Builder.CreateAnd(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_VOR: Result = Builder.CreateOr(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_VNOR: Result = Builder.CreateOr(Ops[0], Ops[1], "tmp"); Result = Builder.CreateNot(Result, "tmp"); return true; case ALTIVEC_BUILTIN_VXOR: Result = Builder.CreateXor(Ops[0], Ops[1], "tmp"); return true; case ALTIVEC_BUILTIN_LVSL: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvsl, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVSR: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvsr, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVX: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVXL: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvxl, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVEBX: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvebx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVEHX: 
MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvehx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_LVEWX: MergeIntPtrOperand(this, 0, Intrinsic::ppc_altivec_lvewx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_STVX: MergeIntPtrOperand(this, 1, Intrinsic::ppc_altivec_stvx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_STVEBX: MergeIntPtrOperand(this, 1, Intrinsic::ppc_altivec_stvebx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_STVEHX: MergeIntPtrOperand(this, 1, Intrinsic::ppc_altivec_stvehx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_STVEWX: MergeIntPtrOperand(this, 1, Intrinsic::ppc_altivec_stvewx, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_STVXL: MergeIntPtrOperand(this, 1, Intrinsic::ppc_altivec_stvxl, ResultType, Ops, Builder, Result); return true; case ALTIVEC_BUILTIN_VSPLTISB: if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) { Elt = ConstantExpr::getIntegerCast(Elt, Type::Int8Ty, true); Result = BuildVector(Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, NULL); } else { error("%Helement must be an immediate", &EXPR_LOCATION(exp)); Result = UndefValue::get(VectorType::get(Type::Int8Ty, 16)); } return true; case ALTIVEC_BUILTIN_VSPLTISH: if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) { Elt = ConstantExpr::getIntegerCast(Elt, Type::Int16Ty, true); Result = BuildVector(Elt, Elt, Elt, Elt, Elt, Elt, Elt, Elt, NULL); } else { error("%Helement must be an immediate", &EXPR_LOCATION(exp)); Result = UndefValue::get(VectorType::get(Type::Int16Ty, 8)); } return true; case ALTIVEC_BUILTIN_VSPLTISW: if (Constant *Elt = dyn_cast<ConstantInt>(Ops[0])) { Elt = ConstantExpr::getIntegerCast(Elt, Type::Int32Ty, true); Result = BuildVector(Elt, Elt, Elt, Elt, NULL); } else { error("%Hmask must be an immediate", &EXPR_LOCATION(exp)); Result = UndefValue::get(VectorType::get(Type::Int32Ty, 4)); } 
return true; case ALTIVEC_BUILTIN_VSPLTB: if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) { int EV = Elt->getZExtValue(); Result = BuildVectorShuffle(Ops[0], Ops[0], EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV, EV); } else { error("%Helement number must be an immediate", &EXPR_LOCATION(exp)); Result = Ops[0]; } return true; case ALTIVEC_BUILTIN_VSPLTH: if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) { int EV = Elt->getZExtValue(); // gcc accepts anything up to 31, and there is code that tests for it, // although it doesn't seem to make sense. Hardware behaves as if mod 8. if (EV>7 && EV<=31) EV = EV%8; Result = BuildVectorShuffle(Ops[0], Ops[0], EV, EV, EV, EV, EV, EV, EV, EV); } else { error("%Helement number must be an immediate", &EXPR_LOCATION(exp)); Result = Ops[0]; } return true; case ALTIVEC_BUILTIN_VSPLTW: if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[1])) { int EV = Elt->getZExtValue(); // gcc accepts anything up to 31, and there is code that tests for it, // although it doesn't seem to make sense. Hardware behaves as if mod 4. if (EV>3 && EV<=31) EV = EV%4; Result = BuildVectorShuffle(Ops[0], Ops[0], EV, EV, EV, EV); } else { error("%Helement number must be an immediate", &EXPR_LOCATION(exp)); Result = Ops[0]; } return true; case ALTIVEC_BUILTIN_VSLDOI_16QI: case ALTIVEC_BUILTIN_VSLDOI_8HI: case ALTIVEC_BUILTIN_VSLDOI_4SI: case ALTIVEC_BUILTIN_VSLDOI_4SF: if (ConstantInt *Elt = dyn_cast<ConstantInt>(Ops[2])) { /* Map all of these to a shuffle. 
*/ unsigned Amt = Elt->getZExtValue() & 15; VectorType *v16i8 = VectorType::get(Type::Int8Ty, 16); Ops[0] = Builder.CreateBitCast(Ops[0], v16i8, "tmp"); Ops[1] = Builder.CreateBitCast(Ops[1], v16i8, "tmp"); Result = BuildVectorShuffle(Ops[0], Ops[1], Amt, Amt+1, Amt+2, Amt+3, Amt+4, Amt+5, Amt+6, Amt+7, Amt+8, Amt+9, Amt+10, Amt+11, Amt+12, Amt+13, Amt+14, Amt+15); } else { error("%Hshift amount must be an immediate", &EXPR_LOCATION(exp)); Result = Ops[0]; } return true; case ALTIVEC_BUILTIN_VPKUHUM: Ops[0] = Builder.CreateBitCast(Ops[0], ResultType, "tmp"); Ops[1] = Builder.CreateBitCast(Ops[1], ResultType, "tmp"); Result = BuildVectorShuffle(Ops[0], Ops[1], 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); return true; case ALTIVEC_BUILTIN_VPKUWUM: Ops[0] = Builder.CreateBitCast(Ops[0], ResultType, "tmp"); Ops[1] = Builder.CreateBitCast(Ops[1], ResultType, "tmp"); Result = BuildVectorShuffle(Ops[0], Ops[1], 1, 3, 5, 7, 9, 11, 13, 15); return true; case ALTIVEC_BUILTIN_VMRGHB: Result = BuildVectorShuffle(Ops[0], Ops[1], 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return true; case ALTIVEC_BUILTIN_VMRGHH: Result = BuildVectorShuffle(Ops[0], Ops[1], 0, 8, 1, 9, 2, 10, 3, 11); return true; case ALTIVEC_BUILTIN_VMRGHW: Result = BuildVectorShuffle(Ops[0], Ops[1], 0, 4, 1, 5); return true; case ALTIVEC_BUILTIN_VMRGLB: Result = BuildVectorShuffle(Ops[0], Ops[1], 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); return true; case ALTIVEC_BUILTIN_VMRGLH: Result = BuildVectorShuffle(Ops[0], Ops[1], 4, 12, 5, 13, 6, 14, 7, 15); return true; case ALTIVEC_BUILTIN_VMRGLW: Result = BuildVectorShuffle(Ops[0], Ops[1], 2, 6, 3, 7); return true; case ALTIVEC_BUILTIN_ABS_V4SF: { // and out sign bits VectorType *v4i32 = VectorType::get(Type::Int32Ty, 4); Ops[0] = Builder.CreateBitCast(Ops[0], v4i32, "tmp"); Constant *C = ConstantInt::get(Type::Int32Ty, 0x7FFFFFFF); C = ConstantVector::get(std::vector<Constant*>(4, C)); Result = 
Builder.CreateAnd(Ops[0], C, "tmp"); Result = Builder.CreateBitCast(Result, ResultType, "tmp"); return true; } case ALTIVEC_BUILTIN_ABS_V4SI: case ALTIVEC_BUILTIN_ABS_V8HI: case ALTIVEC_BUILTIN_ABS_V16QI: { // iabs(x) -> smax(x, 0-x) Result = Builder.CreateNeg(Ops[0], "tmp"); // get the right smax intrinsic. static const Intrinsic::ID smax_iid[3] = { Intrinsic::ppc_altivec_vmaxsw, Intrinsic::ppc_altivec_vmaxsh, Intrinsic::ppc_altivec_vmaxsb }; const VectorType *PTy = cast<VectorType>(ResultType); unsigned N = GetAltivecTypeNumFromType(PTy->getElementType()); Function *smax = Intrinsic::getDeclaration(TheModule, smax_iid[N]); Value *ActualOps[] = { Ops[0], Result }; Result = Builder.CreateCall(smax, ActualOps, ActualOps+2, "tmp"); return true; } case ALTIVEC_BUILTIN_ABSS_V4SI: case ALTIVEC_BUILTIN_ABSS_V8HI: case ALTIVEC_BUILTIN_ABSS_V16QI: { // iabss(x) -> smax(x, satsub(0,x)) // get the right smax/subs intrinsics. static const Intrinsic::ID smax_iid[3] = { Intrinsic::ppc_altivec_vmaxsw, Intrinsic::ppc_altivec_vmaxsh, Intrinsic::ppc_altivec_vmaxsb }; static const Intrinsic::ID subss_iid[3] = { Intrinsic::ppc_altivec_vsubsws, Intrinsic::ppc_altivec_vsubshs, Intrinsic::ppc_altivec_vsubsbs }; // get the right satsub intrinsic. const VectorType *PTy = cast<VectorType>(ResultType); unsigned N = GetAltivecTypeNumFromType(PTy->getElementType()); Function *smax = Intrinsic::getDeclaration(TheModule, smax_iid[N]); Function *subss = Intrinsic::getDeclaration(TheModule, subss_iid[N]); Value *ActualOps[] = { Constant::getNullValue(ResultType), Ops[0] }; Result = Builder.CreateCall(subss, ActualOps, ActualOps+2, "tmp"); ActualOps[0] = Ops[0]; ActualOps[1] = Result; Result = Builder.CreateCall(smax, ActualOps, ActualOps+2, "tmp"); return true; } case ALTIVEC_BUILTIN_VPERM_4SI: case ALTIVEC_BUILTIN_VPERM_4SF: case ALTIVEC_BUILTIN_VPERM_8HI: case ALTIVEC_BUILTIN_VPERM_16QI: { // Operation is identical on all types; we have a single intrinsic. 
const Type *VecTy = VectorType::get(Type::Int32Ty, 4); Value *Op0 = CastToType(Instruction::BitCast, Ops[0], VecTy); Value *Op1 = CastToType(Instruction::BitCast, Ops[1], VecTy); Value *ActualOps[] = { Op0, Op1, Ops[2]}; Result = Builder.CreateCall(Intrinsic::getDeclaration(TheModule, Intrinsic::ppc_altivec_vperm), ActualOps, ActualOps+3, "tmp"); Result = CastToType(Instruction::BitCast, Result, Ops[0]->getType()); return true; } case ALTIVEC_BUILTIN_VSEL_4SI: case ALTIVEC_BUILTIN_VSEL_4SF: case ALTIVEC_BUILTIN_VSEL_8HI: case ALTIVEC_BUILTIN_VSEL_16QI: { // Operation is identical on all types; we have a single intrinsic. const Type *VecTy = VectorType::get(Type::Int32Ty, 4); Value *Op0 = CastToType(Instruction::BitCast, Ops[0], VecTy); Value *Op1 = CastToType(Instruction::BitCast, Ops[1], VecTy); Value *Op2 = CastToType(Instruction::BitCast, Ops[2], VecTy); Value *ActualOps[] = { Op0, Op1, Op2 }; Result = Builder.CreateCall(Intrinsic::getDeclaration(TheModule, Intrinsic::ppc_altivec_vsel), ActualOps, ActualOps+3, "tmp"); Result = CastToType(Instruction::BitCast, Result, Ops[0]->getType()); return true; } } return false; }