/***************
 * Generate code for the binary operator e with both operands evaluated
 * into XMM registers. This variant assembles its result from separate
 * code lists joined with cat()/cat4().
 *
 * NOTE(review): this file also contains a second definition of orthxmm()
 * built on CodeBuilder; presumably only one of them is meant to be
 * compiled - confirm.
 *
 * Params:
 *      e        = binary operator node; operands are e->E1 and e->E2
 *      pretregs = registers the caller wants the result in
 * Returns:
 *      the generated code list
 */
code *orthxmm(elem *e, regm_t *pretregs)
{
    elem *lhs = e->E1;
    elem *rhs = e->E2;

    // Left operand: any XMM register the caller accepts, otherwise any XMM register
    regm_t lregs = *pretregs & XMMREGS;
    if (lregs == 0)
        lregs = XMMREGS;
    code *cl = codelem(lhs, &lregs, FALSE);             // eval left leaf
    const unsigned lreg = findreg(lregs);

    // Right operand: an XMM register other than the left one, which is kept live
    regm_t rregs = XMMREGS & ~lregs;
    code *cr = scodelem(rhs, &rregs, lregs, TRUE);      // eval right leaf

    const unsigned opcode = xmmoperator(lhs->Ety, e->Eoper);
    const unsigned rreg = findreg(rregs);

    // float + ifloat is not actually addition
    const bool addOrSub = (e->Eoper == OPadd || e->Eoper == OPmin);
    if (addOrSub &&
        ((tyreal(lhs->Ety) && tyimaginary(rhs->Ety)) ||
         (tyreal(rhs->Ety) && tyimaginary(lhs->Ety))))
    {
        // The result occupies both operand registers
        lregs |= rregs;
        code *c = cat(cl, cr);

        if (e->Eoper == OPmin)
        {
            // Subtraction: XOR the right operand with a sign-bit mask
            unsigned nretregs = XMMREGS & ~lregs;
            unsigned sreg;                              // hold sign bit
            const unsigned sz = tysize[tybasic(lhs->Ety)];

            c = cat(c, allocreg(&nretregs, &sreg, rhs->Ety));

            const targ_size_t signbit = (sz == 8) ? 0x8000000000000000LL
                                                  : 0x80000000;
            c = cat(c, movxmmconst(sreg, sz, signbit, 0));
            c = cat(c, getregs(nretregs));

            const unsigned xop = (sz == 8) ? XORPD : XORPS;
            // XORPD/S rreg,sreg
            c = cat(c, gen2(CNIL, xop, modregxrmx(3, rreg - XMM0, sreg - XMM0)));
        }

        if (lregs != *pretregs)
            c = cat(c, fixresult(e, lregs, pretregs));
        return c;
    }

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e->Eoper))
    {
        // Relational operator: the ModRM operands are swapped relative to
        // the arithmetic case, and neither getregs() nor fixresult() is emitted.
        code *cc = gen2(CNIL, opcode, modregxrmx(3, rreg - XMM0, lreg - XMM0));
        return cat4(cl, cr, NULL, cc);
    }

    // Arithmetic operator: OP lreg,rreg
    code *cg = getregs(lregs);
    code *co = gen2(CNIL, opcode, modregxrmx(3, lreg - XMM0, rreg - XMM0));
    if (lregs != *pretregs)
        co = cat(co, fixresult(e, lregs, pretregs));

    return cat4(cl, cr, cg, co);
}
/***************
 * Generate code for the binary operator e with both operands evaluated
 * into XMM registers, accumulating the instructions in a CodeBuilder.
 *
 * Params:
 *      e        = binary operator node; operands are e->E1 and e->E2
 *      pretregs = registers the caller wants the result in
 * Returns:
 *      the generated code list
 */
code *orthxmm(elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *lhs = e->E1;
    elem *rhs = e->E2;

    // Candidate registers: the XMM registers the caller accepts, otherwise all of them
    regm_t lregs = *pretregs & XMMREGS;
    if (lregs == 0)
        lregs = XMMREGS;

    CodeBuilder cdb;

    // float + ifloat is not actually addition
    const bool addOrSub = (e->Eoper == OPadd || e->Eoper == OPmin);
    if (addOrSub &&
        ((tyreal(lhs->Ety) && tyimaginary(rhs->Ety)) ||
         (tyreal(rhs->Ety) && tyimaginary(lhs->Ety))))
    {
        // Choose two distinct registers out of the candidate set. The real
        // operand always gets the first one found, the imaginary operand the
        // second, whichever side of the operator each is on.
        unsigned lreg;
        unsigned rreg;
        if (tyreal(lhs->Ety))
        {
            lreg = findreg(lregs);
            rreg = findreg(lregs & ~mask[lreg]);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(lregs);
            lreg = findreg(lregs & ~mask[rreg]);
        }
        lregs = mask[lreg];
        regm_t rregs = mask[rreg];
        assert(lregs && rregs);

        cdb.append(codelem(lhs, &lregs, FALSE));            // eval left leaf
        cdb.append(scodelem(rhs, &rregs, lregs, TRUE));     // eval right leaf

        // The result occupies both operand registers
        lregs |= rregs;

        if (e->Eoper == OPmin)
        {
            // Subtraction: XOR the right operand with a sign-bit mask
            unsigned nretregs = XMMREGS & ~lregs;
            unsigned sreg;                                  // hold sign bit
            const unsigned sz = tysize(lhs->Ety);

            cdb.append(allocreg(&nretregs, &sreg, rhs->Ety));

            const targ_size_t signbit = (sz == 8) ? 0x8000000000000000LL
                                                  : 0x80000000;
            cdb.append(movxmmconst(sreg, sz, signbit, 0));
            cdb.append(getregs(nretregs));

            const unsigned xop = (sz == 8) ? XORPD : XORPS;
            // XORPD/S rreg,sreg
            cdb.gen2(xop, modregxrmx(3, rreg - XMM0, sreg - XMM0));
        }

        if (lregs != *pretregs)
            cdb.append(fixresult(e, lregs, pretregs));
        return cdb.finish();
    }

    cdb.append(codelem(lhs, &lregs, FALSE));                // eval left leaf
    const unsigned lreg = findreg(lregs);

    // Right operand goes to an XMM register other than the left one
    regm_t rregs = XMMREGS & ~lregs;
    cdb.append(scodelem(rhs, &rregs, lregs, TRUE));         // eval right leaf
    const unsigned rreg = findreg(rregs);

    const unsigned opcode = xmmoperator(lhs->Ety, e->Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e->Eoper))
    {
        // Relational operator: the ModRM operands are swapped relative to
        // the arithmetic case, and neither getregs() nor fixresult() is emitted.
        cdb.gen2(opcode, modregxrmx(3, rreg - XMM0, lreg - XMM0));
        checkSetVex(cdb.last(), lhs->Ety);
        return cdb.finish();
    }

    // Arithmetic operator: OP lreg,rreg
    cdb.append(getregs(lregs));
    cdb.gen2(opcode, modregxrmx(3, lreg - XMM0, rreg - XMM0));
    checkSetVex(cdb.last(), lhs->Ety);
    if (lregs != *pretregs)
        cdb.append(fixresult(e, lregs, pretregs));
    return cdb.finish();
}
/***************
 * Generate code for an assignment whose value is held in an XMM register:
 * evaluate e2 into a register, then store it to the lvalue e1.
 *
 * Params:
 *      e        = the assignment node (passed to fixresult())
 *      op       = OPeq to use the store opcode chosen by xmmstore();
 *                 any other value is used directly as the opcode
 *      e1       = lvalue being assigned to
 *      e2       = rvalue
 *      pretregs = registers the caller wants the result in; the mPSW bit
 *                 may be cleared here (see below)
 * Returns:
 *      the generated code list
 */
code *xmmeq(elem *e, unsigned op, elem *e1, elem *e2,regm_t *pretregs)
{
    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    code cs;
    elem *e11 = NULL;           // the OPpostinc/OPpostdec node of (*p++ = ...), if any
    unsigned reg;
    unsigned varreg;
    regm_t varregm = 0;
    bool regvar = FALSE;        /* TRUE means evaluate into register variable */
    targ_int postinc;

    tym_t tyml = tybasic(e1->Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;      // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    cs.Iop = (op == OPeq) ? xmmstore(tyml, aligned) : op;

    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1->EV.sp.Vsym,e2) &&       // and we can compute directly into it
            (varregm & XMMREGS)
           )
        {
            regvar = TRUE;
            retregs = varregm;
            reg = varreg;       /* evaluate directly in target register */
        }
    }

    if ((*pretregs & mPSW) && !EOP(e1))   // if evaluating e1 couldn't change flags
    {
        // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }

    CodeBuilder cdb;
    cdb.append(scodelem(e2,&retregs,0,TRUE));    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1->Eoper == OPind &&
        ((e11 = e1->E1)->Eoper == OPpostinc || e11->Eoper == OPpostdec) &&
        e11->E1->Eoper == OPvar &&
        e11->E1->EV.sp.Vsym->Sfl == FLreg
       )
    {
        postinc = e11->E2->EV.Vint;
        if (e11->Eoper == OPpostdec)
            postinc = -postinc;
        cdb.append(getlvalue(&cs,e11,RMstore | retregs));
        freenode(e11->E2);
    }
    else
    {
        postinc = 0;
        cdb.append(getlvalue(&cs,e1,RMstore | retregs));       // get lvalue (cl == CNIL if regvar)
    }

    cdb.append(getregs_imm(regvar ? varregm : 0));

    // Encode the source XMM register into the reg field of the ModRM byte
    reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        if (op == OPeq)
            checkSetVex(cdb.last(), tyml);
    }

    if (e1->Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        cdb.append(getregs_imm(retregs));        // necessary if both lvalue and
                                                 //  rvalue are CSEs (since a reg
                                                 //  can hold only one e at a time)
        cssave(e1,retregs,EOP(e1));              // if lvalue is a CSE
    }

    cdb.append(fixresult(e,retregs,pretregs));

    if (postinc)
    {
        // Apply the deferred p++ / p-- to the index register of the store
        int idxreg = findreg(idxregm(&cs));
        if (*pretregs & mPSW)
        {
            // Use LEA to avoid touching the flags
            unsigned rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(0x8D,buildModregrm(2,idxreg,rm),FLconst,postinc);
            if (tysize(e11->E1->Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,idxreg),postinc);      // ADD reg,postinc
            if (tysize(e11->E1->Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + idxreg);                        // INC reg
            else if (postinc == -(targ_int)1)
                cdb.gen1(0x48 + idxreg);                        // DEC reg
            else
                cdb.genc2(0x81,modregrm(3,0,idxreg),postinc);   // ADD reg,postinc
        }
    }

    freenode(e1);
    return cdb.finish();
}
/*************** * Generate code for OPvecfill (broadcast). * OPvecfill takes the single value in e1 and * fills the vector type with it. */ code *cdvecfill(elem *e, regm_t *pretregs) { //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs)); regm_t retregs = *pretregs & XMMREGS; if (!retregs) retregs = XMMREGS; CodeBuilder cdb; code *c; code cs; elem *e1 = e->E1; #if 0 if ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar) { cr = getlvalue(&cs, e1, RMload | retregs); // get addressing mode } else { unsigned rretregs = XMMREGS & ~retregs; cr = scodelem(op2, &rretregs, retregs, TRUE); unsigned rreg = findreg(rretregs) - XMM0; cs.Irm = modregrm(3,0,rreg & 7); cs.Iflags = 0; cs.Irex = 0; if (rreg & 8) cs.Irex |= REX_B; } #endif unsigned reg; unsigned rreg; unsigned varreg; regm_t varregm; tym_t ty = tybasic(e->Ety); switch (ty) { case TYfloat4: case TYfloat8: if (config.avx && ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) || tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg) ) { Lint: if (e1->Eoper == OPvar) e1->EV.sp.Vsym->Sflags &= ~GTregcand; // VBROADCASTSS XMM,MEM cdb.append(getlvalue(&cs, e1, 0)); // get addressing mode assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands cdb.append(allocreg(&retregs,®,ty)); cs.Iop = VBROADCASTSS; cs.Irex &= ~REX_W; code_newreg(&cs,reg - XMM0); checkSetVex(&cs,ty); cdb.gen(&cs); } else { // SHUFPS XMM0,XMM0,0 0F C6 /r ib c = codelem(e1,&retregs,FALSE); // eval left leaf cdb.append(c); reg = findreg(retregs) - XMM0; cdb.append(getregs(retregs)); cs.Iop = SHUFPS; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 0; if (config.avx >= 2 || tysize(ty) == 32) { // VBROADCASTSS XMM,XMM cs.Iop = VBROADCASTSS; checkSetVex(&cs, ty); } cdb.gen(&cs); } break; case TYdouble2: case TYdouble4: if (config.avx && ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) || 
tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg) ) { if (e1->Eoper == OPvar) e1->EV.sp.Vsym->Sflags &= ~GTregcand; // VBROADCASTSD XMM,MEM cdb.append(getlvalue(&cs, e1, 0)); // get addressing mode assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands cdb.append(allocreg(&retregs,®,ty)); cs.Iop = VBROADCASTSD; cs.Irex &= ~REX_W; code_newreg(&cs,reg - XMM0); checkSetVex(&cs,ty); cdb.gen(&cs); } else { // UNPCKLPD XMM0,XMM0 66 0F 14 /r c = codelem(e1,&retregs,FALSE); // eval left leaf cdb.append(c); reg = findreg(retregs) - XMM0; cdb.append(getregs(retregs)); cs.Iop = UNPCKLPD; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; if (config.avx >= 2 || tysize(ty) == 32) { // VBROADCASTSD XMM,XMM cs.Iop = VBROADCASTSD; checkSetVex(&cs, ty); } cdb.gen(&cs); } break; case TYschar16: case TYuchar16: case TYschar32: case TYuchar32: { /* MOVD XMM0,r * PUNPCKLBW XMM0,XMM0 * PUNPCKLWD XMM0,XMM0 * PSHUFD XMM0,XMM0,0 */ regm_t regm = ALLREGS; c = codelem(e1,®m,FALSE); // eval left leaf cdb.append(c); unsigned r = findreg(regm); c = allocreg(&retregs,®, e->Ety); cdb.append(c); reg -= XMM0; cdb.gen2(LODD,modregxrmx(3,reg,r)); // MOVD reg,r checkSetVex(cdb.last(),TYschar16); cs.Iop = PUNPCKLBW; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cdb.gen(&cs); cs.Iop = PUNPCKLWD; cdb.gen(&cs); cs.Iop = PSHUFD; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 0; checkSetVex(&cs,TYschar16); cdb.gen(&cs); if (tysize(ty) == 32) { // VINSERTF128 YMM0,YMM0,XMM0,1 cs.Iop = VINSERTF128; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 1; checkSetVex(&cs,ty); cdb.gen(&cs); } break; } case TYshort8: case TYushort8: case TYshort16: case TYushort16: { regm_t regm = ALLREGS; c = codelem(e1,®m,FALSE); // eval left leaf cdb.append(c); unsigned r = findreg(regm); if (config.avx || tysize(ty) == 32) { /* * VPXOR XMM0,XMM0,XMM0 * VPINSRW XMM0,XMM0,r,0 * VPINSRW XMM0,XMM0,r,1 * VPINSRW XMM0,XMM0,r,2 * VPINSRW XMM0,XMM0,r,3 */ cdb.append(allocreg(&retregs,®, 
ty)); cdb.gen2(PXOR,modregxrmx(3,reg-XMM0,reg-XMM0)); checkSetVex(cdb.last(), TYshort8); for (int i = 0; i < tysize(ty) / 4; ++i) { cdb.genc2(PINSRW,modregxrmx(3,reg-XMM0,r),i); checkSetVex(cdb.last(), TYshort8); } if (tysize(ty) == 32) { // VINSERTF128 YMM0,YMM0,XMM0,1 cs.Iop = VINSERTF128; cs.Irm = modregxrmx(3,reg-XMM0,reg-XMM0); cs.Iflags = 0; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 1; checkSetVex(&cs,ty); cdb.gen(&cs); } else { // VPSHUFD XMM0,XMM0,0 cs.Iop = PSHUFD; cs.Irm = modregxrmx(3,reg-XMM0,reg-XMM0); cs.Iflags = 0; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 0; checkSetVex(&cs,ty); cdb.gen(&cs); } } else { /* MOVD XMM0,r * PUNPCKLWD XMM0,XMM0 * PSHUFD XMM0,XMM0,0 */ c = allocreg(&retregs,®, e->Ety); cdb.append(c); reg -= XMM0; cdb.gen2(LODD,modregxrmx(3,reg,r)); // MOVD reg,r checkSetVex(cdb.last(),e->Ety); cs.Iop = PUNPCKLWD; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cdb.gen(&cs); cs.Iop = PSHUFD; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 0; cdb.gen(&cs); } break; } case TYlong8: case TYulong8: case TYlong4: case TYulong4: { if (config.avx && ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) || tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg)) { goto Lint; } /* MOVD XMM1,r * PSHUFD XMM0,XMM1,0 */ regm_t regm = ALLREGS; c = codelem(e1,®m,FALSE); // eval left leaf cdb.append(c); unsigned r = findreg(regm); c = allocreg(&retregs,®, e->Ety); cdb.append(c); reg -= XMM0; cdb.gen2(LODD,modregxrmx(3,reg,r)); // MOVD reg,r cs.Iop = PSHUFD; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cs.IFL2 = FLconst; cs.IEV2.Vsize_t = 0; if (config.avx >= 2 || tysize(ty) == 32) { // VBROADCASTSS XMM,XMM cs.Iop = VBROADCASTSS; checkSetVex(&cs, ty); } cdb.gen(&cs); break; } case TYllong2: case TYullong2: case TYllong4: case TYullong4: if (config.avx || tysize(ty) >= 32) { if (e1->Eoper == OPvar) e1->EV.sp.Vsym->Sflags &= ~GTregcand; // VMOVDDUP XMM,MEM cdb.append(getlvalue(&cs, e1, 0)); // get addressing mode if ((cs.Irm & 0xC0) == 0xC0) { 
unsigned sreg = ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0)); regm_t sregm = XMMREGS; cdb.append(fixresult(e1, mask[sreg], &sregm)); unsigned rmreg = findreg(sregm); cs.Irm = (cs.Irm & ~7) | ((rmreg - XMM0) & 7); if ((rmreg - XMM0) & 8) cs.Irex |= REX_B; else cs.Irex &= ~REX_B; } cdb.append(allocreg(&retregs,®,ty)); if (config.avx >= 2 || tysize(ty) >= 32) { cs.Iop = VBROADCASTSD; cs.Irex &= ~REX_W; } else cs.Iop = MOVDDUP; code_newreg(&cs,reg - XMM0); checkSetVex(&cs,ty); cdb.gen(&cs); } else { /* MOVQ XMM0,mem128 * PUNPCKLQDQ XMM0,XMM0 */ c = codelem(e1,&retregs,FALSE); // eval left leaf cdb.append(c); unsigned reg = findreg(retregs); reg -= XMM0; //cdb.gen2(LODD,modregxrmx(3,reg,r)); // MOVQ reg,r cs.Iop = PUNPCKLQDQ; cs.Irm = modregxrmx(3,reg,reg); cs.Iflags = 0; cdb.gen(&cs); } break; default: assert(0); } c = fixresult(e,retregs,pretregs); cdb.append(c); return cdb.finish(); }
/***************
 * Generate code for a vector operation node whose parameter list carries
 * the XMM opcode followed by one to three operands.
 *
 * Params:
 *      e        = the vector node; e->E1 is the parameter list
 *      pretregs = registers the caller wants the result in; 0 means
 *                 evaluate the parameters for side effects only
 * Returns:
 *      the generated code list
 */
code *cdvector(elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {
        printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the parameter tree into an array: params[0] is the opcode
    unsigned n = el_nparams(e->E1);
    elem **params = (elem **)malloc(n * sizeof(elem *));
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e->E1);

#if 0
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
#endif

    if (*pretregs == 0)
    {
        /* Evaluate for side effects only
         */
        CodeBuilder cdb;
        for (unsigned i = 0; i < n; i++)
        {
            cdb.append(codelem(params[i], pretregs, FALSE));
            *pretregs = 0;      // in case they got set
        }
        free(params);           // the array is no longer needed on this path either
        return cdb.finish();
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = NULL;
    tym_t ty2 = 0;
    if (n >= 3)
    {
        op2 = params[2];
        ty2 = tybasic(op2->Ety);
    }

    unsigned op = el_tolong(eop);
#ifdef DEBUG
    assert(!isXMMstore(op));
#endif

    regm_t retregs;
    CodeBuilder cdb;

    if (n == 3 && ty2 == TYuchar && op2->Eoper == OPconst)
    {
        // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        cdb.append(codelem(op1,&retregs,FALSE));    // eval left leaf
        unsigned reg = findreg(retregs);

        // Map the shift opcode to its immediate form: `op` becomes the
        // imm8 opcode and `r` the value placed in the ModRM reg field.
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
                break;
        }
        cdb.append(getregs(retregs));
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), el_tolong(op2));
    }
    else if (n == 2)
    {
        /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1->Eoper == OPind && !op1->Ecount) || op1->Eoper == OPvar)
        {
            cdb.append(getlvalue(&cs, op1, RMload));     // get addressing mode
        }
        else
        {
            // Evaluate the operand into an XMM register and use a
            // register-direct ModRM byte for it
            regm_t rretregs = XMMREGS;
            cdb.append(codelem(op1, &rretregs, FALSE));
            unsigned rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        unsigned reg;
        cdb.append(allocreg(&retregs, &reg, e->Ety));
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {
        /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */
        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        cdb.append(codelem(op1,&retregs,FALSE));    // eval left leaf
        unsigned reg = findreg(retregs);

        if ((op2->Eoper == OPind && !op2->Ecount) || op2->Eoper == OPvar)
        {
            cdb.append(getlvalue(&cs, op2, RMload | retregs));     // get addressing mode
        }
        else
        {
            // Evaluate op2 into a different XMM register, keeping op1's live
            unsigned rretregs = XMMREGS & ~retregs;
            cdb.append(scodelem(op2, &rretregs, retregs, TRUE));
            unsigned rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        cdb.append(getregs(retregs));

        if (n == 4)
        {
            // Only these opcodes are accepted with a trailing imm8
            switch (op)
            {
                case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
                case PSHUFD:  case PSHUFHW: case PSHUFLW:
                case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
                case MPSADBW: case PBLENDW:
                case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
                case SHUFPD:  case SHUFPS:
                    break;

                default:
                    printf("op = x%x\n", op);
                    assert(0);
                    break;
            }
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
            cs.IEV2.Vsize_t = el_tolong(imm8);
        }

        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);

    cdb.append(fixresult(e,retregs,pretregs));
    free(params);
    freenode(e);
    return cdb.finish();
}