示例#1
0
文件: cgxmm.c 项目: dsagal/dmd
code *orthxmm(elem *e, regm_t *pretregs)
{   elem *e1 = e->E1;
    elem *e2 = e->E2;
    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    code *c = codelem(e1,&retregs,FALSE); // eval left leaf
    unsigned reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    code *cr = scodelem(e2, &rretregs, retregs, TRUE);  // eval right leaf

    unsigned op = xmmoperator(e1->Ety, e->Eoper);
    unsigned rreg = findreg(rretregs);

    // float + ifloat is not actually addition
    if ((e->Eoper == OPadd || e->Eoper == OPmin) &&
        ((tyreal(e1->Ety) && tyimaginary(e2->Ety)) ||
         (tyreal(e2->Ety) && tyimaginary(e1->Ety))))
    {
        retregs |= rretregs;
        c = cat(c, cr);
        if (e->Eoper == OPmin)
        {
            unsigned nretregs = XMMREGS & ~retregs;
            unsigned sreg; // hold sign bit
            unsigned sz = tysize[tybasic(e1->Ety)];
            c = cat(c,allocreg(&nretregs,&sreg,e2->Ety));
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = 0x8000000000000000LL;
            c = cat(c, movxmmconst(sreg, sz, signbit, 0));
            c = cat(c, getregs(nretregs));
            unsigned xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            c = cat(c, gen2(CNIL,xop,modregxrmx(3,rreg-XMM0,sreg-XMM0)));
        }
        if (retregs != *pretregs)
            c = cat(c, fixresult(e,retregs,pretregs));
        return c;
    }

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    code *cg;
    if (OTrel(e->Eoper))
    {
        retregs = mPSW;
        cg = NULL;
        code *cc = gen2(CNIL,op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        return cat4(c,cr,cg,cc);
    }
    else
        cg = getregs(retregs);

    code *co = gen2(CNIL,op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    if (retregs != *pretregs)
        co = cat(co,fixresult(e,retregs,pretregs));

    return cat4(c,cr,cg,co);
}
示例#2
0
文件: cgxmm.c 项目: davispuh/dmd
code *orthxmm(elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e->E1;
    elem *e2 = e->E2;

    // float + ifloat is not actually addition
    if ((e->Eoper == OPadd || e->Eoper == OPmin) &&
        ((tyreal(e1->Ety) && tyimaginary(e2->Ety)) ||
         (tyreal(e2->Ety) && tyimaginary(e1->Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        unsigned reg;
        regm_t rretregs;
        unsigned rreg;
        if (tyreal(e1->Ety))
        {
            reg = findreg(retregs);
            rreg = findreg(retregs & ~mask[reg]);
            retregs = mask[reg];
            rretregs = mask[rreg];
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask[rreg];
            reg = findreg(retregs & ~rretregs);
            retregs = mask[reg];
        }
        assert(retregs && rretregs);

        CodeBuilder cdb;
        cdb.append(codelem(e1,&retregs,FALSE)); // eval left leaf
        cdb.append(scodelem(e2, &rretregs, retregs, TRUE));  // eval right leaf

        retregs |= rretregs;
        if (e->Eoper == OPmin)
        {
            unsigned nretregs = XMMREGS & ~retregs;
            unsigned sreg; // hold sign bit
            unsigned sz = tysize(e1->Ety);
            cdb.append(allocreg(&nretregs,&sreg,e2->Ety));
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = 0x8000000000000000LL;
            cdb.append(movxmmconst(sreg, sz, signbit, 0));
            cdb.append(getregs(nretregs));
            unsigned xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            cdb.append(fixresult(e,retregs,pretregs));
        return cdb.finish();
    }

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    CodeBuilder cdb;
    cdb.append(codelem(e1,&retregs,FALSE)); // eval left leaf
    unsigned reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    cdb.append(scodelem(e2, &rretregs, retregs, TRUE));  // eval right leaf

    unsigned rreg = findreg(rretregs);
    unsigned op = xmmoperator(e1->Ety, e->Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e->Eoper))
    {
        retregs = mPSW;
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1->Ety);
        return cdb.finish();
    }
    else
        cdb.append(getregs(retregs));

    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1->Ety);
    if (retregs != *pretregs)
        cdb.append(fixresult(e,retregs,pretregs));

    return cdb.finish();
}
示例#3
0
文件: cgxmm.c 项目: davispuh/dmd
code *xmmeq(elem *e, unsigned op, elem *e1, elem *e2,regm_t *pretregs)
{
    tym_t tymll;
    unsigned reg;
    int i;
    code cs;
    elem *e11;
    bool regvar;                  /* TRUE means evaluate into register variable */
    regm_t varregm;
    unsigned varreg;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    int e2oper = e2->Eoper;
    tym_t tyml = tybasic(e1->Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    cs.Iop = (op == OPeq) ? xmmstore(tyml, aligned) : op;
    regvar = FALSE;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1->EV.sp.Vsym,e2) &&       // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = TRUE;
            retregs = varregm;
            reg = varreg;       /* evaluate directly in target register */
        }
    }
    if (*pretregs & mPSW && !EOP(e1))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    CodeBuilder cdb;
    cdb.append(scodelem(e2,&retregs,0,TRUE));    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1->Eoper == OPind &&
        ((e11 = e1->E1)->Eoper == OPpostinc || e11->Eoper == OPpostdec) &&
        e11->E1->Eoper == OPvar &&
        e11->E1->EV.sp.Vsym->Sfl == FLreg
       )
    {
        postinc = e11->E2->EV.Vint;
        if (e11->Eoper == OPpostdec)
            postinc = -postinc;
        cdb.append(getlvalue(&cs,e11,RMstore | retregs));
        freenode(e11->E2);
    }
    else
    {   postinc = 0;
        cdb.append(getlvalue(&cs,e1,RMstore | retregs));       // get lvalue (cl == CNIL if regvar)
    }

    cdb.append(getregs_imm(regvar ? varregm : 0));

    reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        if (op == OPeq)
            checkSetVex(cdb.last(), tyml);
    }

    if (e1->Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        cdb.append(getregs_imm(retregs));        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,EOP(e1));     // if lvalue is a CSE
    }

    cdb.append(fixresult(e,retregs,pretregs));
Lp:
    if (postinc)
    {
        int reg = findreg(idxregm(&cs));
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            unsigned rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(0x8D,buildModregrm(2,reg,rm),FLconst,postinc);
            if (tysize(e11->E1->Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,reg),postinc);
            if (tysize(e11->E1->Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + reg);         // INC reg
            else if (postinc == -(targ_int)1)
                cdb.gen1(0x48 + reg);         // DEC reg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,reg),postinc);
            }
        }
    }
    freenode(e1);
    return cdb.finish();
}
示例#4
0
文件: cgxmm.c 项目: davispuh/dmd
/***************
 * Generate code for OPvecfill (broadcast).
 * OPvecfill takes the single value in e1 and
 * fills the vector type with it.
 */
code *cdvecfill(elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    CodeBuilder cdb;
    code *c;
    code cs;

    elem *e1 = e->E1;
#if 0
    if ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar)
    {
        cr = getlvalue(&cs, e1, RMload | retregs);     // get addressing mode
    }
    else
    {
        unsigned rretregs = XMMREGS & ~retregs;
        cr = scodelem(op2, &rretregs, retregs, TRUE);
        unsigned rreg = findreg(rretregs) - XMM0;
        cs.Irm = modregrm(3,0,rreg & 7);
        cs.Iflags = 0;
        cs.Irex = 0;
        if (rreg & 8)
            cs.Irex |= REX_B;
    }
#endif

    unsigned reg;
    unsigned rreg;
    unsigned varreg;
    regm_t varregm;
    tym_t ty = tybasic(e->Ety);
    switch (ty)
    {
        case TYfloat4:
        case TYfloat8:
            if (config.avx &&
                ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) ||
                tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg)
               )
            {
              Lint:
                if (e1->Eoper == OPvar)
                    e1->EV.sp.Vsym->Sflags &= ~GTregcand;

                // VBROADCASTSS XMM,MEM
                cdb.append(getlvalue(&cs, e1, 0));         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);           // AVX1 doesn't have register source operands
                cdb.append(allocreg(&retregs,&reg,ty));
                cs.Iop = VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // SHUFPS XMM0,XMM0,0    0F C6 /r ib
                c = codelem(e1,&retregs,FALSE); // eval left leaf
                cdb.append(c);
                reg = findreg(retregs) - XMM0;
                cdb.append(getregs(retregs));
                cs.Iop = SHUFPS;
                cs.Irm = modregxrmx(3,reg,reg);
                cs.Iflags = 0;
                cs.IFL2 = FLconst;
                cs.IEV2.Vsize_t = 0;
                if (config.avx >= 2 || tysize(ty) == 32)
                {
                    // VBROADCASTSS XMM,XMM
                    cs.Iop = VBROADCASTSS;
                    checkSetVex(&cs, ty);
                }
                cdb.gen(&cs);
            }
            break;

        case TYdouble2:
        case TYdouble4:
            if (config.avx &&
                ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) ||
                tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg)
               )
            {
                if (e1->Eoper == OPvar)
                    e1->EV.sp.Vsym->Sflags &= ~GTregcand;

                // VBROADCASTSD XMM,MEM
                cdb.append(getlvalue(&cs, e1, 0));         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);           // AVX1 doesn't have register source operands
                cdb.append(allocreg(&retregs,&reg,ty));
                cs.Iop = VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // UNPCKLPD XMM0,XMM0     66 0F 14 /r
                c = codelem(e1,&retregs,FALSE); // eval left leaf
                cdb.append(c);
                reg = findreg(retregs) - XMM0;
                cdb.append(getregs(retregs));
                cs.Iop = UNPCKLPD;
                cs.Irm = modregxrmx(3,reg,reg);
                cs.Iflags = 0;
                if (config.avx >= 2 || tysize(ty) == 32)
                {
                    // VBROADCASTSD XMM,XMM
                    cs.Iop = VBROADCASTSD;
                    checkSetVex(&cs, ty);
                }
                cdb.gen(&cs);
            }
            break;

        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
        {
            /* MOVD      XMM0,r
             * PUNPCKLBW XMM0,XMM0
             * PUNPCKLWD XMM0,XMM0
             * PSHUFD    XMM0,XMM0,0
             */
            regm_t regm = ALLREGS;
            c = codelem(e1,&regm,FALSE); // eval left leaf
            cdb.append(c);
            unsigned r = findreg(regm);

            c = allocreg(&retregs,&reg, e->Ety);
            cdb.append(c);
            reg -= XMM0;
            cdb.gen2(LODD,modregxrmx(3,reg,r));     // MOVD reg,r
            checkSetVex(cdb.last(),TYschar16);

            cs.Iop = PUNPCKLBW;
            cs.Irm = modregxrmx(3,reg,reg);
            cs.Iflags = 0;
            cdb.gen(&cs);
            cs.Iop = PUNPCKLWD;
            cdb.gen(&cs);

            cs.Iop = PSHUFD;
            cs.IFL2 = FLconst;
            cs.IEV2.Vsize_t = 0;
            checkSetVex(&cs,TYschar16);
            cdb.gen(&cs);
            if (tysize(ty) == 32)
            {
                // VINSERTF128 YMM0,YMM0,XMM0,1
                cs.Iop = VINSERTF128;
                cs.Irm = modregxrmx(3,reg,reg);
                cs.Iflags = 0;
                cs.IFL2 = FLconst;
                cs.IEV2.Vsize_t = 1;
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            break;
        }

        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
        {
            regm_t regm = ALLREGS;
            c = codelem(e1,&regm,FALSE); // eval left leaf
            cdb.append(c);
            unsigned r = findreg(regm);

            if (config.avx || tysize(ty) == 32)
            {
                /*
                 * VPXOR XMM0,XMM0,XMM0
                 * VPINSRW XMM0,XMM0,r,0
                 * VPINSRW XMM0,XMM0,r,1
                 * VPINSRW XMM0,XMM0,r,2
                 * VPINSRW XMM0,XMM0,r,3
                 */
                cdb.append(allocreg(&retregs,&reg, ty));
                cdb.gen2(PXOR,modregxrmx(3,reg-XMM0,reg-XMM0));
                checkSetVex(cdb.last(), TYshort8);
                for (int i = 0; i < tysize(ty) / 4; ++i)
                {
                    cdb.genc2(PINSRW,modregxrmx(3,reg-XMM0,r),i);
                    checkSetVex(cdb.last(), TYshort8);
                }
                if (tysize(ty) == 32)
                {
                    // VINSERTF128 YMM0,YMM0,XMM0,1
                    cs.Iop = VINSERTF128;
                    cs.Irm = modregxrmx(3,reg-XMM0,reg-XMM0);
                    cs.Iflags = 0;
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 1;
                    checkSetVex(&cs,ty);
                    cdb.gen(&cs);
                }
                else
                {
                    // VPSHUFD XMM0,XMM0,0
                    cs.Iop = PSHUFD;
                    cs.Irm = modregxrmx(3,reg-XMM0,reg-XMM0);
                    cs.Iflags = 0;
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 0;
                    checkSetVex(&cs,ty);
                    cdb.gen(&cs);
                }
            }
            else
            {
                /* MOVD      XMM0,r
                 * PUNPCKLWD XMM0,XMM0
                 * PSHUFD    XMM0,XMM0,0
                 */
                c = allocreg(&retregs,&reg, e->Ety);
                cdb.append(c);
                reg -= XMM0;
                cdb.gen2(LODD,modregxrmx(3,reg,r));     // MOVD reg,r
                checkSetVex(cdb.last(),e->Ety);

                cs.Iop = PUNPCKLWD;
                cs.Irm = modregxrmx(3,reg,reg);
                cs.Iflags = 0;
                cdb.gen(&cs);

                cs.Iop = PSHUFD;
                cs.IFL2 = FLconst;
                cs.IEV2.Vsize_t = 0;
                cdb.gen(&cs);
            }
            break;
        }

        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
        {
            if (config.avx &&
                ((e1->Eoper == OPind && !e1->Ecount) || e1->Eoper == OPvar && !isregvar(e1,&varregm,&varreg)) ||
                tysize(ty) == 32 && !isregvar(e1,&varregm,&varreg))
            {
                goto Lint;
            }
            /* MOVD      XMM1,r
             * PSHUFD    XMM0,XMM1,0
             */
            regm_t regm = ALLREGS;
            c = codelem(e1,&regm,FALSE); // eval left leaf
            cdb.append(c);
            unsigned r = findreg(regm);

            c = allocreg(&retregs,&reg, e->Ety);
            cdb.append(c);
            reg -= XMM0;
            cdb.gen2(LODD,modregxrmx(3,reg,r));     // MOVD reg,r

            cs.Iop = PSHUFD;
            cs.Irm = modregxrmx(3,reg,reg);
            cs.Iflags = 0;
            cs.IFL2 = FLconst;
            cs.IEV2.Vsize_t = 0;
            if (config.avx >= 2 || tysize(ty) == 32)
            {
                // VBROADCASTSS XMM,XMM
                cs.Iop = VBROADCASTSS;
                checkSetVex(&cs, ty);
            }
            cdb.gen(&cs);
            break;
        }

        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            if (config.avx || tysize(ty) >= 32)
            {
                if (e1->Eoper == OPvar)
                    e1->EV.sp.Vsym->Sflags &= ~GTregcand;

                // VMOVDDUP XMM,MEM
                cdb.append(getlvalue(&cs, e1, 0));         // get addressing mode
                if ((cs.Irm & 0xC0) == 0xC0)
                {
                    unsigned sreg = ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0));
                    regm_t sregm = XMMREGS;
                    cdb.append(fixresult(e1, mask[sreg], &sregm));
                    unsigned rmreg = findreg(sregm);
                    cs.Irm = (cs.Irm & ~7) | ((rmreg - XMM0) & 7);
                    if ((rmreg - XMM0) & 8)
                        cs.Irex |= REX_B;
                    else
                        cs.Irex &= ~REX_B;
                }
                cdb.append(allocreg(&retregs,&reg,ty));
                if (config.avx >= 2 ||  tysize(ty) >= 32)
                {
                    cs.Iop = VBROADCASTSD;
                    cs.Irex &= ~REX_W;
                }
                else
                    cs.Iop = MOVDDUP;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                /* MOVQ XMM0,mem128
                 * PUNPCKLQDQ XMM0,XMM0
                 */
                c = codelem(e1,&retregs,FALSE); // eval left leaf
                cdb.append(c);
                unsigned reg = findreg(retregs);
                reg -= XMM0;
                //cdb.gen2(LODD,modregxrmx(3,reg,r));     // MOVQ reg,r

                cs.Iop = PUNPCKLQDQ;
                cs.Irm = modregxrmx(3,reg,reg);
                cs.Iflags = 0;
                cdb.gen(&cs);
            }
            break;

        default:
            assert(0);
    }

    c = fixresult(e,retregs,pretregs);
    cdb.append(c);
    return cdb.finish();
}
示例#5
0
文件: cgxmm.c 项目: davispuh/dmd
code *cdvector(elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    unsigned n = el_nparams(e->E1);
    elem **params = (elem **)malloc(n * sizeof(elem *));
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e->E1);

#if 0
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
#endif

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        CodeBuilder cdb;
        for (int i = 0; i < n; i++)
        {
            cdb.append(codelem(params[i], pretregs, FALSE));
            *pretregs = 0;      // in case they got set
        }
        return cdb.finish();
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = NULL;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2->Ety);
    }

    unsigned op = el_tolong(eop);
#ifdef DEBUG
    assert(!isXMMstore(op));
#endif
    tym_t ty1 = tybasic(op1->Ety);
    unsigned sz1 = _tysize[ty1];
//    assert(sz1 == 16);       // float or double

    regm_t retregs;
    CodeBuilder cdb;
    if (n == 3 && ty2 == TYuchar && op2->Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        cdb.append(codelem(op1,&retregs,FALSE)); // eval left leaf
        unsigned reg = findreg(retregs);
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
                break;
        }
        cdb.append(getregs(retregs));
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1->Eoper == OPind && !op1->Ecount) || op1->Eoper == OPvar)
        {
            cdb.append(getlvalue(&cs, op1, RMload));     // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS;
            cdb.append(codelem(op1, &rretregs, FALSE));
            unsigned rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        unsigned reg;
        cdb.append(allocreg(&retregs, &reg, e->Ety));
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        cdb.append(codelem(op1,&retregs,FALSE)); // eval left leaf
        unsigned reg = findreg(retregs);

        if ((op2->Eoper == OPind && !op2->Ecount) || op2->Eoper == OPvar)
        {
            cdb.append(getlvalue(&cs, op2, RMload | retregs));     // get addressing mode
        }
        else
        {
            unsigned rretregs = XMMREGS & ~retregs;
            cdb.append(scodelem(op2, &rretregs, retregs, TRUE));
            unsigned rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        cdb.append(getregs(retregs));
        if (n == 4)
        {
            switch (op)
            {
                case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
                case PSHUFD:  case PSHUFHW: case PSHUFLW:
                case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
                case MPSADBW: case PBLENDW:
                case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
                case SHUFPD:  case SHUFPS:
                    break;
                default:
                    printf("op = x%x\n", op);
                    assert(0);
                    break;
            }
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
            cs.IEV2.Vsize_t = el_tolong(imm8);
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    cdb.append(fixresult(e,retregs,pretregs));
    free(params);
    freenode(e);
    return cdb.finish();
}