void
MacroAssemblerX86::convertUInt64ToDouble(Register64 src, Register temp, FloatRegister dest)
{
    // SUBPD needs SSE2, HADDPD needs SSE3.
    if (!HasSSE3()) {
        convertUInt32ToDouble(src.high, dest);
        movePtr(ImmPtr(&TO_DOUBLE_HIGH_SCALE), temp);
        loadDouble(Address(temp, 0), ScratchDoubleReg);
        asMasm().mulDouble(ScratchDoubleReg, dest);
        convertUInt32ToDouble(src.low, ScratchDoubleReg);
        asMasm().addDouble(ScratchDoubleReg, dest);
        return;
    }

    // Following operation uses entire 128-bit of dest XMM register.
    // Currently higher 64-bit is free when we have access to lower 64-bit.
    MOZ_ASSERT(dest.size() == 8);
    FloatRegister dest128 = FloatRegister(dest.encoding(), FloatRegisters::Simd128);

    // Assume that src is represented as following:
    //   src      = 0x HHHHHHHH LLLLLLLL

    // Move src to dest (=dest128) and ScratchInt32x4Reg (=scratch):
    //   dest     = 0x 00000000 00000000  00000000 LLLLLLLL
    //   scratch  = 0x 00000000 00000000  00000000 HHHHHHHH
    vmovd(src.low, dest128);
    vmovd(src.high, ScratchSimd128Reg);

    // Unpack and interleave dest and scratch to dest:
    //   dest     = 0x 00000000 00000000  HHHHHHHH LLLLLLLL
    vpunpckldq(ScratchSimd128Reg, dest128, dest128);

    // Unpack and interleave dest and a constant C1 to dest:
    //   C1       = 0x 00000000 00000000  45300000 43300000
    //   dest     = 0x 45300000 HHHHHHHH  43300000 LLLLLLLL
    // here, each 64-bit part of dest represents following double:
    //   HI(dest) = 0x 1.00000HHHHHHHH * 2**84 == 2**84 + 0x HHHHHHHH 00000000
    //   LO(dest) = 0x 1.00000LLLLLLLL * 2**52 == 2**52 + 0x 00000000 LLLLLLLL
    movePtr(ImmPtr(TO_DOUBLE), temp);
    vpunpckldq(Operand(temp, 0), dest128, dest128);

    // Subtract a constant C2 from dest, for each 64-bit part:
    //   C2       = 0x 45300000 00000000  43300000 00000000
    // here, each 64-bit part of C2 represents following double:
    //   HI(C2)   = 0x 1.0000000000000 * 2**84 == 2**84
    //   LO(C2)   = 0x 1.0000000000000 * 2**52 == 2**52
    // after the operation each 64-bit part of dest represents following:
    //   HI(dest) = double(0x HHHHHHHH 00000000)
    //   LO(dest) = double(0x 00000000 LLLLLLLL)
    vsubpd(Operand(temp, sizeof(uint64_t) * 2), dest128, dest128);

    // Add HI(dest) and LO(dest) in double and store it into LO(dest),
    //   LO(dest) = double(0x HHHHHHHH 00000000) + double(0x 00000000 LLLLLLLL)
    //            = double(0x HHHHHHHH LLLLLLLL)
    //            = double(src)
    vhaddpd(dest128, dest128);
}
void
MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set, LiveRegisterSet ignore)
{
    FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
    unsigned numFpu = fpuSet.size();
    int32_t diffG = set.gprs().size() * sizeof(intptr_t);
    int32_t diffF = fpuSet.getPushSizeInBytes();
    const int32_t reservedG = diffG;
    const int32_t reservedF = diffF;

    for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); iter++) {
        FloatRegister reg = *iter;
        diffF -= reg.size();
        numFpu -= 1;
        if (ignore.has(reg))
            continue;

        Address spillAddress(StackPointer, diffF);
        if (reg.isDouble())
            loadDouble(spillAddress, reg);
        else if (reg.isSingle())
            loadFloat32(spillAddress, reg);
        else if (reg.isInt32x4())
            loadUnalignedInt32x4(spillAddress, reg);
        else if (reg.isFloat32x4())
            loadUnalignedFloat32x4(spillAddress, reg);
        else
            MOZ_CRASH("Unknown register type.");
    }
    freeStack(reservedF);
    MOZ_ASSERT(numFpu == 0);
    // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
    // GetPushBytesInSize.
    diffF -= diffF % sizeof(uintptr_t);
    MOZ_ASSERT(diffF == 0);

    // On x86, use pop to pop the integer registers, if we're not going to
    // ignore any slots, as it's fast on modern hardware and it's a small
    // instruction.
    if (ignore.emptyGeneral()) {
        for (GeneralRegisterForwardIterator iter(set.gprs()); iter.more(); iter++) {
            diffG -= sizeof(intptr_t);
            Pop(*iter);
        }
    } else {
        for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); iter++) {
            diffG -= sizeof(intptr_t);
            if (!ignore.has(*iter))
                loadPtr(Address(StackPointer, diffG), *iter);
        }
        freeStack(reservedG);
    }
    MOZ_ASSERT(diffG == 0);
}
void
MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest, Register)
{
    FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
    unsigned numFpu = fpuSet.size();
    int32_t diffF = fpuSet.getPushSizeInBytes();
    int32_t diffG = set.gprs().size() * sizeof(intptr_t);

    MOZ_ASSERT(dest.offset >= diffG + diffF);

    for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
        diffG -= sizeof(intptr_t);
        dest.offset -= sizeof(intptr_t);
        storePtr(*iter, dest);
    }
    MOZ_ASSERT(diffG == 0);

    for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
        FloatRegister reg = *iter;
        diffF -= reg.size();
        numFpu -= 1;
        dest.offset -= reg.size();
        if (reg.isDouble())
            storeDouble(reg, dest);
        else if (reg.isSingle())
            storeFloat32(reg, dest);
        else if (reg.isSimd128())
            storeUnalignedSimd128Float(reg, dest);
        else
            MOZ_CRASH("Unknown register type.");
    }
    MOZ_ASSERT(numFpu == 0);
    // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
    // GetPushBytesInSize.
    diffF -= diffF % sizeof(uintptr_t);
    MOZ_ASSERT(diffF == 0);
}
void
MacroAssembler::PushRegsInMask(LiveRegisterSet set)
{
    FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
    unsigned numFpu = fpuSet.size();
    int32_t diffF = fpuSet.getPushSizeInBytes();
    int32_t diffG = set.gprs().size() * sizeof(intptr_t);

    // On x86, always use push to push the integer registers, as it's fast
    // on modern hardware and it's a small instruction.
    for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); iter++) {
        diffG -= sizeof(intptr_t);
        Push(*iter);
    }
    MOZ_ASSERT(diffG == 0);

    reserveStack(diffF);
    for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); iter++) {
        FloatRegister reg = *iter;
        diffF -= reg.size();
        numFpu -= 1;
        Address spillAddress(StackPointer, diffF);
        if (reg.isDouble())
            storeDouble(reg, spillAddress);
        else if (reg.isSingle())
            storeFloat32(reg, spillAddress);
        else if (reg.isInt32x4())
            storeUnalignedInt32x4(reg, spillAddress);
        else if (reg.isFloat32x4())
            storeUnalignedFloat32x4(reg, spillAddress);
        else
            MOZ_CRASH("Unknown register type.");
    }
    MOZ_ASSERT(numFpu == 0);
    // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
    // GetPushBytesInSize.
    diffF -= diffF % sizeof(uintptr_t);
    MOZ_ASSERT(diffF == 0);
}