static TR::Register *l2fd(TR::Node *node, TR::RealRegister *target, TR_X86OpCodes opRegMem8, TR_X86OpCodes opRegReg8, TR::CodeGenerator *cg)
   {
   TR::Node *child = node->getFirstChild();
   TR::MemoryReference *tempMR;

   TR_ASSERT(cg->useSSEForSinglePrecision(), "assertion failure");

   if (child->getRegister() == NULL &&
       child->getReferenceCount() == 1 &&
       child->getOpCode().isLoadVar())
      {
      tempMR = generateX86MemoryReference(child, cg);
      generateRegMemInstruction(opRegMem8, node, target, tempMR, cg);
      tempMR->decNodeReferenceCounts(cg);
      }
   else
      {
      TR::Register *intReg = cg->evaluate(child);
      generateRegRegInstruction(opRegReg8, node, target, intReg, cg);
      cg->decReferenceCount(child);
      }

   node->setRegister(target);
   return target;
   }
TR::Register* OMR::X86::TreeEvaluator::SIMDsplatsEvaluator(TR::Node* node, TR::CodeGenerator* cg)
   {
   TR::Node* childNode = node->getChild(0);
   TR::Register* childReg = cg->evaluate(childNode);

   TR::Register* resultReg = cg->allocateRegister(TR_VRF);
   switch (node->getDataType())
      {
      case TR::VectorInt32:
         generateRegRegInstruction(MOVDRegReg4, node, resultReg, childReg, cg);
         generateRegRegImmInstruction(PSHUFDRegRegImm1, node, resultReg, resultReg, 0x00, cg); // 00 00 00 00 shuffle xxxA to AAAA
         break;
      case TR::VectorInt64:
         if (TR::Compiler->target.is32Bit())
            {
            TR::Register* tempVectorReg = cg->allocateRegister(TR_VRF);
            generateRegRegInstruction(MOVDRegReg4, node, tempVectorReg, childReg->getHighOrder(), cg);
            generateRegImmInstruction(PSLLQRegImm1, node, tempVectorReg, 0x20, cg);
            generateRegRegInstruction(MOVDRegReg4, node, resultReg, childReg->getLowOrder(), cg);
            generateRegRegInstruction(PORRegReg, node, resultReg, tempVectorReg, cg);
            cg->stopUsingRegister(tempVectorReg);
            }
         else
            {
            generateRegRegInstruction(MOVQRegReg8, node, resultReg, childReg, cg);
            }
         generateRegRegImmInstruction(PSHUFDRegRegImm1, node, resultReg, resultReg, 0x44, cg); // 01 00 01 00 shuffle xxBA to BABA
         break;
      case TR::VectorFloat:
         generateRegRegImmInstruction(PSHUFDRegRegImm1, node, resultReg, childReg, 0x00, cg); // 00 00 00 00 shuffle xxxA to AAAA
         break;
      case TR::VectorDouble:
         generateRegRegImmInstruction(PSHUFDRegRegImm1, node, resultReg, childReg, 0x44, cg); // 01 00 01 00 shuffle xxBA to BABA
         break;
      default:
         if (cg->comp()->getOption(TR_TraceCG))
            traceMsg(cg->comp(), "Unsupported data type, Node = %p\n", node);
         TR_ASSERT(false, "Unsupported data type");
         break;
      }

   node->setRegister(resultReg);
   cg->decReferenceCount(childNode);
   return resultReg;
   }
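// Illustration only (not OMR code, not part of the build): a scalar model of the
// PSHUFD immediates used above. Each 2-bit field of the immediate selects which
// source dword lands in the corresponding destination dword, so 0x00 splats
// dword 0 into all four lanes (32-bit splat) and 0x44 (binary 01 00 01 00)
// replicates the low qword (64-bit splat).
#include <cstdint>

static void pshufdModel(uint32_t dst[4], const uint32_t src[4], uint8_t imm)
   {
   for (int i = 0; i < 4; ++i)
      dst[i] = src[(imm >> (2 * i)) & 0x3];   // 2-bit selector per destination dword
   }
// Usage: with src = {A, B, x, x}, imm 0x00 yields {A, A, A, A} and imm 0x44 yields {A, B, A, B}.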
TR::Register *OMR::X86::AMD64::TreeEvaluator::lbits2dEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   // TODO:AMD64: Peepholing
   TR::Node *child = node->getFirstChild();
   TR::Register *sreg = cg->evaluate(child);
   TR::Register *treg = cg->allocateRegister(TR_FPR);
   generateRegRegInstruction(MOVQRegReg8, node, treg, sreg, cg);
   node->setRegister(treg);
   cg->decReferenceCount(child);
   return treg;
   }
TR::Register *
OMR::X86::I386::CodeGenerator::longClobberEvaluate(TR::Node *node)
   {
   TR_ASSERT(node->getOpCode().is8Byte(), "assertion failure");
   if (node->getReferenceCount() > 1)
      {
      TR::Register *temp = self()->evaluate(node);
      TR::Register *lowReg = self()->allocateRegister();
      TR::Register *highReg = self()->allocateRegister();
      TR::RegisterPair *longReg = self()->allocateRegisterPair(lowReg, highReg);

      generateRegRegInstruction(MOV4RegReg, node, lowReg, temp->getLowOrder(), self());
      generateRegRegInstruction(MOV4RegReg, node, highReg, temp->getHighOrder(), self());
      return longReg;
      }
   else
      {
      return self()->evaluate(node);
      }
   }
TR::Register *OMR::X86::AMD64::TreeEvaluator::lcmpEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   TR::Node *firstChild = node->getFirstChild();
   TR::Node *secondChild = node->getSecondChild();

   TR::Register *leftRegister = cg->evaluate(firstChild);
   TR::Register *rightRegister = cg->evaluate(secondChild);

   // Compare left and right operands; we are finished with the operands after this.
   generateRegRegInstruction(CMP8RegReg, node, leftRegister, rightRegister, cg);
   cg->decReferenceCount(firstChild);
   cg->decReferenceCount(secondChild);

   TR::Register *isLessThanReg = cg->allocateRegister();
   TR::Register *isNotEqualReg = cg->allocateRegister();
   cg->getLiveRegisters(TR_GPR)->setByteRegisterAssociation(isLessThanReg);
   cg->getLiveRegisters(TR_GPR)->setByteRegisterAssociation(isNotEqualReg);

   // The state of things in each possible case after each instruction:
   //
   //                       left < right   left = right   left > right
   // Processor flags:      NE=1 LT=1      NE=0 LT=0      NE=1 LT=0
   generateRegInstruction(SETL1Reg, node, isLessThanReg, cg);
   // isLessThanReg:        00000001       00000000       00000000
   generateRegInstruction(SETNE1Reg, node, isNotEqualReg, cg);
   // isNotEqualReg:        00000001       00000000       00000001
   generateRegInstruction(NEG1Reg, node, isLessThanReg, cg);
   // isLessThanReg:        11111111       00000000       00000000
   generateRegRegInstruction(OR1RegReg, node, isNotEqualReg, isLessThanReg, cg);
   // isNotEqualReg:        11111111       00000000       00000001
   generateRegRegInstruction(MOVSXReg4Reg1, node, isNotEqualReg, isNotEqualReg, cg);

   node->setRegister(isNotEqualReg);
   cg->stopUsingRegister(isLessThanReg);
   return isNotEqualReg;
   }
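// Illustration only (not OMR code): a scalar model of the branch-free sequence
// above. The generated SETL/SETNE/NEG/OR/MOVSX instructions compute
// (-(left < right)) | (left != right), which yields -1, 0, or 1.
#include <cstdint>

static int32_t lcmpModel(int64_t left, int64_t right)
   {
   int32_t isLessThan = (left < right) ? 1 : 0;   // SETL
   int32_t isNotEqual = (left != right) ? 1 : 0;  // SETNE
   isLessThan = -isLessThan;                      // NEG: byte 0x01 becomes 0xFF (-1)
   isNotEqual |= isLessThan;                      // OR: -1 if less, 0 if equal, 1 if greater
   return isNotEqual;                             // MOVSX widens the byte result
   }
// lcmpModel(1, 2) == -1, lcmpModel(2, 2) == 0, lcmpModel(3, 2) == 1.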
TR::Register *OMR::X86::AMD64::TreeEvaluator::l2iEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   TR::Node *child = node->getFirstChild();
   TR::Register *reg = cg->evaluate(child);

   if (child->getReferenceCount() > 1)
      {
      // This catches two scenarios:
      //
      // 1) A longClobberEvaluate (or any other register-clobbering logic) on
      // the l2i node could see a refcount of 1, and hence won't make a copy.
      // If child's refcount is more than 1, we do in fact need a copy, so we'd
      // better do it here.
      //
      // 2) If the child is commoned, and the l2i node is also commoned, then
      // we may end up with a situation where the last evaluation of the child
      // is a clobberEvaluate. By that time, the child's refcount would be 1,
      // so no copy is made, and the register would be clobbered. Therefore,
      // the l2i node can't return that same register, or else the other uses
      // of the node will end up getting the clobbered value.
      //
      // Note that case 2 is conservative, in that it presumes that the child's
      // register will be clobbered by another node. If this does not occur,
      // then the copy we're about to make is unnecessary.
      //
      TR::Register *childReg = reg;
      reg = cg->allocateRegister();

      // to support signExtension in GRA, need to preserve upper word
      // in this move
      generateRegRegInstruction(MOV8RegReg, node, reg, childReg, cg);
      }

   node->setRegister(reg);
   cg->decReferenceCount(child);

   if (cg->enableRegisterInterferences() && node->getOpCode().getSize() == 1)
      cg->getLiveRegisters(TR_GPR)->setByteRegisterAssociation(node->getRegister());

   return reg;
   }
/*
 * users should call the integerSubtractAnalyser or integerSubtractAnalyserWithExplicitOperands APIs instead of calling this one directly
 */
TR::Register* TR_X86SubtractAnalyser::integerSubtractAnalyserImpl(TR::Node *root, TR::Node *firstChild, TR::Node *secondChild, TR_X86OpCodes regRegOpCode, TR_X86OpCodes regMemOpCode, TR_X86OpCodes copyOpCode, bool needsEflags, TR::Node *borrow)
   {
   TR::Register *targetRegister = NULL;
   TR::Register *firstRegister = firstChild->getRegister();
   TR::Register *secondRegister = secondChild->getRegister();
   setInputs(firstChild, firstRegister, secondChild, secondRegister);

   bool loadedConst = false;

   needsEflags = needsEflags || NEED_CC(root);

   if (getEvalChild1())
      {
      // if firstChild and secondChild are the same node, then we should
      // evaluate (take the else path) so that the evaluate for the secondChild
      // below will get the correct/already-allocated register.
      if (firstRegister == 0 && firstChild->getOpCodeValue() == TR::iconst && (firstChild != secondChild))
         {
         // An iconst may have to be generated. The iconst will be generated after the
         // secondChild is evaluated. Set the loadedConst flag to true.
         loadedConst = true;
         }
      else
         {
         firstRegister = _cg->evaluate(firstChild);
         }
      }

   if (getEvalChild2())
      {
      secondRegister = _cg->evaluate(secondChild);
      if (firstChild->getRegister())
         {
         firstRegister = firstChild->getRegister();
         }
      else if (!loadedConst)
         {
         firstRegister = _cg->evaluate(firstChild);
         }
      }

   if (loadedConst)
      {
      if (firstRegister == 0)
         {
         // firstChild is an iconst and it has not been evaluated.
         // Generate the code for an iconst.
         firstRegister = _cg->allocateRegister();
         TR::TreeEvaluator::insertLoadConstant(firstChild, firstRegister, firstChild->getInt(), TR_RematerializableInt, _cg);
         }
      else
         {
         // firstChild was evaluated. The code for an iconst does not need to be generated.
         // Set the loadedConst flag to false.
         loadedConst = false;
         }
      }

   if (borrow != 0)
      TR_X86ComputeCC::setCarryBorrow(borrow, true, _cg);

   if (getCopyReg1())
      {
      if (firstChild->getReferenceCount() > 1)
         {
         TR::Register *thirdReg;
         if (firstChild->getOpCodeValue() == TR::iconst && loadedConst)
            {
            thirdReg = firstRegister;
            }
         else
            {
            if (secondChild->getReferenceCount() == 1 && secondRegister != 0 && !needsEflags && (borrow == 0))
               {
               // Use one fewer register if we negate the clobberable secondRegister and add.
               // Don't do this though if condition codes are needed. The sequence
               // depends on the carry flag being valid as if a sub was done.
               //
               bool nodeIs64Bit = TR_X86OpCode(regRegOpCode).hasLongSource();
               generateRegInstruction(NEGReg(nodeIs64Bit), secondChild, secondRegister, _cg);
               thirdReg = secondRegister;
               secondRegister = firstRegister;
               regRegOpCode = ADDRegReg(nodeIs64Bit);
               }
            else
               {
               thirdReg = _cg->allocateRegister();
               generateRegRegInstruction(copyOpCode, root, thirdReg, firstRegister, _cg);
               }
            }
         targetRegister = thirdReg;

         if (getSubReg3Reg2())
            {
            generateRegRegInstruction(regRegOpCode, root, thirdReg, secondRegister, _cg);
            }
         else // assert getSubReg3Mem2() == true
            {
            TR::MemoryReference *tempMR = generateX86MemoryReference(secondChild, _cg);
            generateRegMemInstruction(regMemOpCode, root, thirdReg, tempMR, _cg);
            tempMR->decNodeReferenceCounts(_cg);
            }
         }
      else
         {
         if (getSubReg3Reg2())
            {
            generateRegRegInstruction(regRegOpCode, root, firstRegister, secondRegister, _cg);
            }
         else // assert getSubReg3Mem2() == true
            {
            TR::MemoryReference *tempMR = generateX86MemoryReference(secondChild, _cg);
            generateRegMemInstruction(regMemOpCode, root, firstRegister, tempMR, _cg);
            tempMR->decNodeReferenceCounts(_cg);
            }
         targetRegister = firstRegister;
         }
      }
   else if (getSubReg1Reg2())
      {
      generateRegRegInstruction(regRegOpCode, root, firstRegister, secondRegister, _cg);
      targetRegister = firstRegister;
      }
   else // assert getSubReg1Mem2() == true
      {
      TR::MemoryReference *tempMR = generateX86MemoryReference(secondChild, _cg);
      generateRegMemInstruction(regMemOpCode, root, firstRegister, tempMR, _cg);
      targetRegister = firstRegister;
      tempMR->decNodeReferenceCounts(_cg);
      }

   return targetRegister;
   }
/*
 * users should call the longSubtractAnalyser or longSubtractAnalyserWithExplicitOperands APIs instead of calling this one directly
 */
TR::Register* TR_X86SubtractAnalyser::longSubtractAnalyserImpl(TR::Node *root, TR::Node *&firstChild, TR::Node *&secondChild)
   {
   TR::Register *firstRegister = firstChild->getRegister();
   TR::Register *secondRegister = secondChild->getRegister();
   TR::Register *targetRegister = NULL;

   bool firstHighZero = false;
   bool secondHighZero = false;
   bool useSecondHighOrder = false;

   TR_X86OpCodes regRegOpCode = SUB4RegReg;
   TR_X86OpCodes regMemOpCode = SUB4RegMem;

   bool needsEflags = NEED_CC(root) || (root->getOpCodeValue() == TR::lusubb);

   // We can generate better code for long subtracts when one or more children have a
   // high-order zero word, and we can avoid the evaluation when we don't need the
   // result of such nodes for another parent.
   //
   if (firstChild->isHighWordZero() && !needsEflags)
      {
      firstHighZero = true;
      }

   if (secondChild->isHighWordZero() && !needsEflags)
      {
      secondHighZero = true;
      TR::ILOpCodes secondOp = secondChild->getOpCodeValue();
      if (secondChild->getReferenceCount() == 1 && secondRegister == 0)
         {
         if (secondOp == TR::iu2l || secondOp == TR::su2l || secondOp == TR::bu2l ||
             (secondOp == TR::lushr &&
              secondChild->getSecondChild()->getOpCodeValue() == TR::iconst &&
              (secondChild->getSecondChild()->getInt() & TR::TreeEvaluator::shiftMask(true)) == 32))
            {
            secondChild = secondChild->getFirstChild();
            secondRegister = secondChild->getRegister();
            if (secondOp == TR::lushr)
               {
               useSecondHighOrder = true;
               }
            }
         }
      }

   setInputs(firstChild, firstRegister, secondChild, secondRegister);

   if (isVolatileMemoryOperand(firstChild))
      resetMem1();

   if (isVolatileMemoryOperand(secondChild))
      resetMem2();

   if (getEvalChild1())
      {
      firstRegister = _cg->evaluate(firstChild);
      }

   if (getEvalChild2())
      {
      secondRegister = _cg->evaluate(secondChild);
      }

   if (secondHighZero && secondRegister && secondRegister->getRegisterPair())
      {
      if (!useSecondHighOrder)
         {
         secondRegister = secondRegister->getLowOrder();
         }
      else
         {
         secondRegister = secondRegister->getHighOrder();
         }
      }

   if (root->getOpCodeValue() == TR::lusubb &&
       TR_X86ComputeCC::setCarryBorrow(root->getChild(2), true, _cg))
      {
      // use SBB rather than SUB
      //
      regRegOpCode = SBB4RegReg;
      regMemOpCode = SBB4RegMem;
      }

   if (getCopyReg1())
      {
      TR::Register *lowThird = _cg->allocateRegister();
      TR::Register *highThird = _cg->allocateRegister();
      TR::RegisterPair *thirdReg = _cg->allocateRegisterPair(lowThird, highThird);
      targetRegister = thirdReg;
      generateRegRegInstruction(MOV4RegReg, root, lowThird, firstRegister->getLowOrder(), _cg);

      if (firstHighZero)
         {
         generateRegRegInstruction(XOR4RegReg, root, highThird, highThird, _cg);
         }
      else
         {
         generateRegRegInstruction(MOV4RegReg, root, highThird, firstRegister->getHighOrder(), _cg);
         }

      if (getSubReg3Reg2())
         {
         if (secondHighZero)
            {
            generateRegRegInstruction(regRegOpCode, root, lowThird, secondRegister, _cg);
            generateRegImmInstruction(SBB4RegImms, root, highThird, 0, _cg);
            }
         else
            {
            generateRegRegInstruction(regRegOpCode, root, lowThird, secondRegister->getLowOrder(), _cg);
            generateRegRegInstruction(SBB4RegReg, root, highThird, secondRegister->getHighOrder(), _cg);
            }
         }
      else // assert getSubReg3Mem2() == true
         {
         TR::MemoryReference *lowMR = generateX86MemoryReference(secondChild, _cg);
         /**
          * The below code is needed to ensure correct behaviour when the subtract analyser encounters a lushr bytecode that shifts
          * by 32 bits. This is the only case where the useSecondHighOrder bit is set.
          * When the first child of the lushr is in a register, code above handles the shift. When the first child of the lushr is in
          * memory, the below ensures that the upper part of the first child of the lushr is used as lowMR.
          */
         if (useSecondHighOrder)
            {
            TR_ASSERT(secondHighZero, "useSecondHighOrder should be consistent with secondHighZero. useSecondHighOrder subsumes secondHighZero");
            lowMR = generateX86MemoryReference(*lowMR, 4, _cg);
            }

         generateRegMemInstruction(regMemOpCode, root, lowThird, lowMR, _cg);

         if (secondHighZero)
            {
            generateRegImmInstruction(SBB4RegImms, root, highThird, 0, _cg);
            }
         else
            {
            TR::MemoryReference *highMR = generateX86MemoryReference(*lowMR, 4, _cg);
            generateRegMemInstruction(SBB4RegMem, root, highThird, highMR, _cg);
            }
         lowMR->decNodeReferenceCounts(_cg);
         }
      }
   else if (getSubReg1Reg2())
      {
      if (secondHighZero)
         {
         generateRegRegInstruction(regRegOpCode, root, firstRegister->getLowOrder(), secondRegister, _cg);
         generateRegImmInstruction(SBB4RegImms, root, firstRegister->getHighOrder(), 0, _cg);
         }
      else
         {
         generateRegRegInstruction(regRegOpCode, root, firstRegister->getLowOrder(), secondRegister->getLowOrder(), _cg);
         generateRegRegInstruction(SBB4RegReg, root, firstRegister->getHighOrder(), secondRegister->getHighOrder(), _cg);
         }
      targetRegister = firstRegister;
      }
   else // assert getSubReg1Mem2() == true
      {
      TR::MemoryReference *lowMR = generateX86MemoryReference(secondChild, _cg);
      /**
       * The below code is needed to ensure correct behaviour when the subtract analyser encounters a lushr bytecode that shifts
       * by 32 bits. This is the only case where the useSecondHighOrder bit is set.
       * When the first child of the lushr is in a register, code above handles the shift. When the first child of the lushr is in
       * memory, the below ensures that the upper part of the first child of the lushr is used as lowMR.
       */
      if (useSecondHighOrder)
         lowMR = generateX86MemoryReference(*lowMR, 4, _cg);

      generateRegMemInstruction(regMemOpCode, root, firstRegister->getLowOrder(), lowMR, _cg);

      if (secondHighZero)
         {
         generateRegImmInstruction(SBB4RegImms, root, firstRegister->getHighOrder(), 0, _cg);
         }
      else
         {
         TR::MemoryReference *highMR = generateX86MemoryReference(*lowMR, 4, _cg);
         generateRegMemInstruction(SBB4RegMem, root, firstRegister->getHighOrder(), highMR, _cg);
         }

      targetRegister = firstRegister;
      lowMR->decNodeReferenceCounts(_cg);
      }

   return targetRegister;
   }
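// Illustration only (not OMR code): a scalar model of the 32-bit lowering generated
// above. A 64-bit subtract becomes SUB on the low words followed by SBB on the high
// words; when the subtrahend's high word is known to be zero, the high-word step
// collapses to "SBB high, 0", which only propagates the borrow.
#include <cstdint>

static uint64_t sub64Model(uint32_t aLo, uint32_t aHi, uint32_t bLo, uint32_t bHi)
   {
   uint32_t lo = aLo - bLo;                 // SUB4RegReg / SUB4RegMem
   uint32_t borrow = (aLo < bLo) ? 1 : 0;   // carry flag left by the SUB
   uint32_t hi = aHi - bHi - borrow;        // SBB4RegReg, or SBB4RegImms with bHi == 0
   return ((uint64_t)hi << 32) | lo;
   }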
TR::Register *OMR::X86::AMD64::TreeEvaluator::dbits2lEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   // TODO:AMD64: Peepholing
   TR::Node *child = node->getFirstChild();
   TR::Register *sreg = cg->evaluate(child);
   TR::Register *treg = cg->allocateRegister(TR_GPR);
   generateRegRegInstruction(MOVQReg8Reg, node, treg, sreg, cg);
   if (node->normalizeNanValues())
      {
      static char *disableFastNormalizeNaNs = feGetEnv("TR_disableFastNormalizeNaNs");
      if (disableFastNormalizeNaNs)
         {
         // This one is not clever, but it is simple, and it's based directly
         // on the IA32 version which is known to work, so is safer.
         //
         TR::RegisterDependencyConditions *deps = generateRegisterDependencyConditions((uint8_t)0, (uint8_t)1, cg);
         deps->addPostCondition(treg, TR::RealRegister::NoReg, cg);

         TR::IA32ConstantDataSnippet *nan1Snippet = cg->findOrCreate8ByteConstant(node, DOUBLE_NAN_1_LOW);
         TR::IA32ConstantDataSnippet *nan2Snippet = cg->findOrCreate8ByteConstant(node, DOUBLE_NAN_2_LOW);
         TR::MemoryReference *nan1MR = generateX86MemoryReference(nan1Snippet, cg);
         TR::MemoryReference *nan2MR = generateX86MemoryReference(nan2Snippet, cg);

         TR::LabelSymbol *startLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         TR::LabelSymbol *normalizeLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         TR::LabelSymbol *endLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         startLabel->setStartInternalControlFlow();
         endLabel->setEndInternalControlFlow();

         generateLabelInstruction(LABEL, node, startLabel, cg);
         generateRegMemInstruction(CMP8RegMem, node, treg, nan1MR, cg);
         generateLabelInstruction(JGE4, node, normalizeLabel, cg);
         generateRegMemInstruction(CMP8RegMem, node, treg, nan2MR, cg);
         generateLabelInstruction(JB4, node, endLabel, cg);
         generateLabelInstruction(LABEL, node, normalizeLabel, cg);
         generateRegImm64Instruction(MOV8RegImm64, node, treg, DOUBLE_NAN, cg);
         generateLabelInstruction(LABEL, node, endLabel, deps, cg);
         }
      else
         {
         // A bunch of bookkeeping
         //
         uint64_t nanDetector = DOUBLE_NAN_2_LOW;

         TR::RegisterDependencyConditions *internalControlFlowDeps = generateRegisterDependencyConditions((uint8_t)0, (uint8_t)1, cg);
         internalControlFlowDeps->addPostCondition(treg, TR::RealRegister::NoReg, cg);

         TR::RegisterDependencyConditions *helperDeps = generateRegisterDependencyConditions((uint8_t)1, (uint8_t)1, cg);
         helperDeps->addPreCondition(treg, TR::RealRegister::eax, cg);
         helperDeps->addPostCondition(treg, TR::RealRegister::eax, cg);

         TR::IA32ConstantDataSnippet *nanDetectorSnippet = cg->findOrCreate8ByteConstant(node, nanDetector);
         TR::MemoryReference *nanDetectorMR = generateX86MemoryReference(nanDetectorSnippet, cg);

         TR::LabelSymbol *startLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         TR::LabelSymbol *slowPathLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         TR::LabelSymbol *normalizeLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         TR::LabelSymbol *endLabel = TR::LabelSymbol::create(cg->trHeapMemory(), cg);
         startLabel->setStartInternalControlFlow();
         endLabel->setEndInternalControlFlow();

         // Fast path: if subtracting nanDetector leaves CF=0 or OF=1, then it
         // must be a NaN.
         //
         generateLabelInstruction(LABEL, node, startLabel, cg);
         generateRegMemInstruction(CMP8RegMem, node, treg, nanDetectorMR, cg);
         generateLabelInstruction(JAE4, node, slowPathLabel, cg);
         generateLabelInstruction(JO4, node, slowPathLabel, cg);

         // Slow path
         //
         TR_OutlinedInstructions *slowPath = new (cg->trHeapMemory()) TR_OutlinedInstructions(slowPathLabel, cg);
         cg->getOutlinedInstructionsList().push_front(slowPath);
         slowPath->swapInstructionListsWithCompilation();
         generateLabelInstruction(NULL, LABEL, slowPathLabel, cg)->setNode(node);
         generateRegImm64Instruction(MOV8RegImm64, node, treg, DOUBLE_NAN, cg);
         generateLabelInstruction(JMP4, node, endLabel, cg);
         slowPath->swapInstructionListsWithCompilation();

         // Merge point
         //
         generateLabelInstruction(LABEL, node, endLabel, internalControlFlowDeps, cg);
         }
      }
   node->setRegister(treg);
   cg->decReferenceCount(child);
   return treg;
   }
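// Illustration only (not OMR code): a scalar model of what the fast and slow paths
// above compute. DOUBLE_NAN and the detector constants are defined elsewhere in the
// code generator; the canonical NaN value shown here (0x7FF8000000000000) is an
// assumption based on the usual IEEE 754 canonical quiet NaN. Any bit pattern whose
// exponent field is all ones and whose mantissa is nonzero is a NaN and gets replaced.
#include <cstdint>

static uint64_t normalizeNaNBitsModel(uint64_t bits)
   {
   const uint64_t exponentMask = 0x7FF0000000000000ULL;
   const uint64_t mantissaMask = 0x000FFFFFFFFFFFFFULL;
   const uint64_t canonicalNaN = 0x7FF8000000000000ULL;   // assumed canonical encoding
   if ((bits & exponentMask) == exponentMask && (bits & mantissaMask) != 0)
      bits = canonicalNaN;
   return bits;
   }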
// Build arguments for system linkage dispatch.
//
int32_t TR::AMD64SystemLinkage::buildArgs(
      TR::Node *callNode,
      TR::RegisterDependencyConditions *deps)
   {
   TR::SymbolReference *methodSymRef = callNode->getSymbolReference();
   TR::MethodSymbol *methodSymbol = methodSymRef->getSymbol()->castToMethodSymbol();
   TR::RealRegister::RegNum noReg = TR::RealRegister::NoReg;
   TR::RealRegister *espReal = machine()->getX86RealRegister(TR::RealRegister::esp);
   int32_t firstNodeArgument = callNode->getFirstArgumentIndex();
   int32_t lastNodeArgument = callNode->getNumChildren() - 1;
   int32_t offset = 0;
   int32_t sizeOfOutGoingArgs = 0;
   uint16_t numIntArgs = 0, numFloatArgs = 0;
   int32_t first, last, direction;
   int32_t numCopiedRegs = 0;
   TR::Register *copiedRegs[TR::X86LinkageProperties::MaxArgumentRegisters];

   if (getProperties().passArgsRightToLeft())
      {
      first = lastNodeArgument;
      last = firstNodeArgument - 1;
      direction = -1;
      }
   else
      {
      first = firstNodeArgument;
      last = lastNodeArgument + 1;
      direction = 1;
      }

   // If the dispatch is indirect we must add the VFT register to the preconditions
   // so that it gets register assigned with the other preconditions to the call.
   //
   if (callNode->getOpCode().isIndirect())
      {
      TR::Node *vftChild = callNode->getFirstChild();
      TR_ASSERT(vftChild->getRegister(), "expecting VFT child to be evaluated");
      TR::RealRegister::RegNum scratchRegIndex = getProperties().getIntegerScratchRegister(1);
      deps->addPreCondition(vftChild->getRegister(), scratchRegIndex, cg());
      }

   int32_t i;
   for (i = first; i != last; i += direction)
      {
      TR::parmLayoutResult layoutResult;
      TR::RealRegister::RegNum rregIndex = noReg;
      TR::Node *child = callNode->getChild(i);

      layoutParm(child, sizeOfOutGoingArgs, numIntArgs, numFloatArgs, layoutResult);

      if (layoutResult.abstract & TR::parmLayoutResult::IN_LINKAGE_REG_PAIR)
         {
         // TODO: the AMD64 SysV ABI might put a struct into a pair of linkage registers
         TR_ASSERT(false, "linkage_reg_pair is not supported yet.\n");
         }
      else if (layoutResult.abstract & TR::parmLayoutResult::IN_LINKAGE_REG)
         {
         TR_RegisterKinds regKind = layoutResult.regs[0].regKind;
         uint32_t regIndex = layoutResult.regs[0].regIndex;
         TR_ASSERT(regKind == TR_GPR || regKind == TR_FPR, "linkage registers include only TR_GPR and TR_FPR\n");
         rregIndex = (regKind == TR_FPR) ? getProperties().getFloatArgumentRegister(regIndex) : getProperties().getIntegerArgumentRegister(regIndex);
         }
      else
         {
         offset = layoutResult.offset;
         }

      TR::Register *vreg;
      vreg = cg()->evaluate(child);

      bool needsStackOffsetUpdate = false;
      if (rregIndex != noReg)
         {
         // For NULL JNI reference parameters, it is possible that the NULL value will be evaluated into
         // a different register than the child. In that case it is not necessary to copy the temporary scratch
         // register across the call.
         //
         if ((child->getReferenceCount() > 1) && (vreg == child->getRegister()))
            {
            TR::Register *argReg = cg()->allocateRegister();
            if (vreg->containsCollectedReference())
               argReg->setContainsCollectedReference();
            generateRegRegInstruction(TR::Linkage::movOpcodes(RegReg, movType(child->getDataType())), child, argReg, vreg, cg());
            vreg = argReg;
            copiedRegs[numCopiedRegs++] = vreg;
            }

         deps->addPreCondition(vreg, rregIndex, cg());
         }
      else
         {
         // Ideally, we would like to push rather than move
         generateMemRegInstruction(TR::Linkage::movOpcodes(MemReg, fullRegisterMovType(vreg)),
                                   child,
                                   generateX86MemoryReference(espReal, offset, cg()),
                                   vreg,
                                   cg());
         }

      cg()->decReferenceCount(child);
      }

   // Now that we're finished making the preconditions, all the interferences
   // are established and we can kill these regs.
   //
   for (i = 0; i < numCopiedRegs; i++)
      cg()->stopUsingRegister(copiedRegs[i]);

   deps->stopAddingPreConditions();

   return sizeOfOutGoingArgs;
   }
// Copies parameters from where they enter the method (either on stack or in a
// linkage register) to their "home location" where the method body will expect
// to find them (either on stack or in a global register).
//
TR::Instruction *
TR::X86SystemLinkage::copyParametersToHomeLocation(TR::Instruction *cursor)
   {
   TR::Machine *machine = cg()->machine();
   TR::RealRegister *framePointer = machine->getX86RealRegister(TR::RealRegister::vfp);

   TR::ResolvedMethodSymbol *bodySymbol = comp()->getJittedMethodSymbol();
   ListIterator<TR::ParameterSymbol> paramIterator(&(bodySymbol->getParameterList()));
   TR::ParameterSymbol *paramCursor;

   const TR::RealRegister::RegNum noReg = TR::RealRegister::NoReg;
   TR_ASSERT(noReg == 0, "noReg must be zero so zero-initializing movStatus will work");

   TR::MovStatus movStatus[TR::RealRegister::NumRegisters] = {{(TR::RealRegister::RegNum)0, (TR::RealRegister::RegNum)0, (TR_MovDataTypes)0}};

   // We must always do the stores first, then the reg-reg copies, then the
   // loads, so that we never clobber a register we will need later. However,
   // the logic is simpler if we do the loads and stores in the same loop.
   // Therefore, we maintain a separate instruction cursor for the loads.
   //
   // We defer the initialization of loadCursor until we generate the first
   // load. Otherwise, if we happen to generate some stores first, then the
   // store cursor would get ahead of the loadCursor, and the instructions
   // would end up in the wrong order despite our efforts.
   //
   TR::Instruction *loadCursor = NULL;

   // Phase 1: generate RegMem and MemReg movs, and collect information about
   // the required RegReg movs.
   //
   for (paramCursor = paramIterator.getFirst();
        paramCursor != NULL;
        paramCursor = paramIterator.getNext())
      {
      int8_t lri = paramCursor->getLinkageRegisterIndex();     // How the parameter enters the method
      TR::RealRegister::RegNum ai                               // Where method body expects to find it
         = (TR::RealRegister::RegNum)paramCursor->getAllocatedIndex();
      int32_t offset = paramCursor->getParameterOffset();      // Location of the parameter's stack slot
      TR_MovDataTypes movDataType = paramMovType(paramCursor); // What sort of MOV instruction does it need?

      // Copy the parameter to wherever it should be
      //
      if (lri == NOT_LINKAGE) // It's on the stack
         {
         if (ai == NOT_ASSIGNED) // It only needs to be on the stack
            {
            // Nothing to do
            }
         else // Method body expects it to be in the ai register
            {
            if (loadCursor == NULL)
               loadCursor = cursor;

            if (debug("traceCopyParametersToHomeLocation"))
               diagnostic("copyParametersToHomeLocation: Loading %d\n", ai);
            // ai := stack
            loadCursor = generateRegMemInstruction(
               loadCursor,
               TR::Linkage::movOpcodes(RegMem, movDataType),
               machine->getX86RealRegister(ai),
               generateX86MemoryReference(framePointer, offset, cg()),
               cg()
               );
            }
         }
      else // It's in a linkage register
         {
         TR::RealRegister::RegNum sourceIndex = getProperties().getArgumentRegister(lri, isFloat(movDataType));

         // Copy to the stack if necessary
         //
         if (ai == NOT_ASSIGNED || hasToBeOnStack(paramCursor))
            {
            if (comp()->getOption(TR_TraceCG))
               traceMsg(comp(), "copyToHomeLocation param %p, linkage reg index %d, allocated index %d, parameter offset %d, hasToBeOnStack %d, parm->isParmHasToBeOnStack() %d.\n", paramCursor, lri, ai, offset, hasToBeOnStack(paramCursor), paramCursor->isParmHasToBeOnStack());
            if (debug("traceCopyParametersToHomeLocation"))
               diagnostic("copyParametersToHomeLocation: Storing %d\n", sourceIndex);
            // stack := lri
            cursor = generateMemRegInstruction(
               cursor,
               TR::Linkage::movOpcodes(MemReg, movDataType),
               generateX86MemoryReference(framePointer, offset, cg()),
               machine->getX86RealRegister(sourceIndex),
               cg()
               );
            }

         // Copy to the ai register if necessary
         //
         if (ai != NOT_ASSIGNED && ai != sourceIndex)
            {
            // This parameter needs a RegReg move. We don't know yet whether
            // we need the value in the target register, so for now we just
            // remember that we need to do this and keep going.
            //
            TR_ASSERT(movStatus[ai].sourceReg == noReg, "Each target reg must have only one source");
            TR_ASSERT(movStatus[sourceIndex].targetReg == noReg, "Each source reg must have only one target");
            if (debug("traceCopyParametersToHomeLocation"))
               diagnostic("copyParametersToHomeLocation: Planning to move %d to %d\n", sourceIndex, ai);
            movStatus[ai].sourceReg = sourceIndex;
            movStatus[sourceIndex].targetReg = ai;
            movStatus[sourceIndex].outgoingDataType = movDataType;
            }

         if (debug("traceCopyParametersToHomeLocation") && ai == sourceIndex)
            {
            diagnostic("copyParametersToHomeLocation: Parameter #%d already in register %d\n", lri, ai);
            }
         }
      }

   // Phase 2: Iterate through the parameters again to insert the RegReg moves.
   //
   for (paramCursor = paramIterator.getFirst();
        paramCursor != NULL;
        paramCursor = paramIterator.getNext())
      {
      if (paramCursor->getLinkageRegisterIndex() == NOT_LINKAGE)
         continue;

      const TR::RealRegister::RegNum paramReg =
         getProperties().getArgumentRegister(paramCursor->getLinkageRegisterIndex(), isFloat(paramMovType(paramCursor)));

      if (movStatus[paramReg].targetReg == 0)
         {
         // This parameter does not need to be copied anywhere
         if (debug("traceCopyParametersToHomeLocation"))
            diagnostic("copyParametersToHomeLocation: Not moving %d\n", paramReg);
         }
      else
         {
         if (debug("traceCopyParametersToHomeLocation"))
            diagnostic("copyParametersToHomeLocation: Preparing to move %d\n", paramReg);

         // If a mov's target register is the source for another mov, we need
         // to do that other mov first. The idea is to find the end point of
         // the chain of movs starting with paramReg and ending with a
         // register whose current value is not needed; then do that chain of
         // movs in reverse order.
         //
         TR_ASSERT(noReg == 0, "noReg must be zero (not %d) for zero-filled initialization to work", noReg);

         TR::RealRegister::RegNum regCursor;

         // Find the last target in the chain
         //
         regCursor = movStatus[paramReg].targetReg;
         while (movStatus[regCursor].targetReg != noReg)
            {
            // Haven't found the end yet
            regCursor = movStatus[regCursor].targetReg;
            TR_ASSERT(regCursor != paramReg, "Can't yet handle cyclic dependencies");

            // TODO:AMD64 Use scratch register to break cycles
            //
            // A properly-written pickRegister should never
            // cause cycles to occur in the first place. However, we may want
            // to consider adding cycle-breaking logic so that (1) pickRegister
            // has more flexibility, and (2) we're more robust against
            // otherwise harmless bugs in pickRegister.
            }

         // Work our way backward along the chain, generating all the necessary movs
         //
         while (movStatus[regCursor].sourceReg != noReg)
            {
            TR::RealRegister::RegNum source = movStatus[regCursor].sourceReg;
            if (debug("traceCopyParametersToHomeLocation"))
               diagnostic("copyParametersToHomeLocation: Moving %d to %d\n", source, regCursor);
            // regCursor := regCursor.sourceReg
            cursor = generateRegRegInstruction(
               cursor,
               TR::Linkage::movOpcodes(RegReg, movStatus[source].outgoingDataType),
               machine->getX86RealRegister(regCursor),
               machine->getX86RealRegister(source),
               cg()
               );
            // Update movStatus as we go so we don't generate redundant movs
            movStatus[regCursor].sourceReg = noReg;
            movStatus[source].targetReg = noReg;
            // Continue with the next register in the chain
            regCursor = source;
            }
         }
      }

   // Return the last instruction we inserted, whether or not it was a load.
   //
   return loadCursor ? loadCursor : cursor;
   }
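// Illustration only (not OMR code): a minimal sketch of the chain-reversal idea in
// phase 2 above, using plain arrays in place of MovStatus and RealRegister. Follow
// the targetReg links to the end of the chain, then emit the moves walking back
// toward the start so no live source is overwritten before it has been copied.
#include <cstdio>

namespace movChainSketch {

enum { NO_REG = 0, NUM_REGS = 8 };

static int sourceReg[NUM_REGS];   // sourceReg[t] == s means the move "t := s" is pending
static int targetReg[NUM_REGS];   // targetReg[s] == t is the inverse link

static void emitChain(int paramReg)
   {
   int cursor = targetReg[paramReg];
   if (cursor == NO_REG)
      return;                                     // this register's value is not moved anywhere
   while (targetReg[cursor] != NO_REG)
      cursor = targetReg[cursor];                 // find the end of the chain
   while (sourceReg[cursor] != NO_REG)
      {
      int source = sourceReg[cursor];
      printf("mov r%d, r%d\n", cursor, source);   // safe: cursor's old value is no longer needed
      sourceReg[cursor] = NO_REG;                 // clear bookkeeping to avoid redundant moves
      targetReg[source] = NO_REG;
      cursor = source;                            // continue backward along the chain
      }
   }

}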
TR::Register *TR_IA32XMMCompareAnalyser::xmmCompareAnalyser(TR::Node *root,
                                                            TR_X86OpCodes cmpRegRegOpCode,
                                                            TR_X86OpCodes cmpRegMemOpCode)
   {
   TR::Node *firstChild, *secondChild;
   TR::ILOpCodes cmpOp = root->getOpCodeValue();
   bool reverseMemOp = false;
   bool reverseCmpOp = false;

   // Some operators must have their operands swapped to improve the generated
   // code needed to evaluate the result of the comparison.
   //
   bool mustSwapOperands = (cmpOp == TR::iffcmple || cmpOp == TR::ifdcmple ||
                            cmpOp == TR::iffcmpgtu || cmpOp == TR::ifdcmpgtu ||
                            cmpOp == TR::fcmple || cmpOp == TR::dcmple ||
                            cmpOp == TR::fcmpgtu || cmpOp == TR::dcmpgtu ||
                            cmpOp == TR::iffcmplt || cmpOp == TR::ifdcmplt ||
                            cmpOp == TR::iffcmpgeu || cmpOp == TR::ifdcmpgeu ||
                            cmpOp == TR::fcmplt || cmpOp == TR::dcmplt ||
                            cmpOp == TR::fcmpgeu || cmpOp == TR::dcmpgeu) ? true : false;

   // Some operators should not have their operands swapped to improve the generated
   // code needed to evaluate the result of the comparison.
   //
   bool preventOperandSwapping = (cmpOp == TR::iffcmpltu || cmpOp == TR::ifdcmpltu ||
                                  cmpOp == TR::iffcmpge || cmpOp == TR::ifdcmpge ||
                                  cmpOp == TR::fcmpltu || cmpOp == TR::dcmpltu ||
                                  cmpOp == TR::fcmpge || cmpOp == TR::dcmpge ||
                                  cmpOp == TR::iffcmpgt || cmpOp == TR::ifdcmpgt ||
                                  cmpOp == TR::iffcmpleu || cmpOp == TR::ifdcmpleu ||
                                  cmpOp == TR::fcmpgt || cmpOp == TR::dcmpgt ||
                                  cmpOp == TR::fcmpleu || cmpOp == TR::dcmpleu) ? true : false;

   // For correctness, don't swap operands of these operators.
   //
   if (cmpOp == TR::fcmpg || cmpOp == TR::fcmpl ||
       cmpOp == TR::dcmpg || cmpOp == TR::dcmpl)
      {
      preventOperandSwapping = true;
      }

   // Initial operand evaluation ordering.
   //
   if (preventOperandSwapping || (!mustSwapOperands && _cg->whichChildToEvaluate(root) == 0))
      {
      firstChild = root->getFirstChild();
      secondChild = root->getSecondChild();
      setReversedOperands(false);
      }
   else
      {
      firstChild = root->getSecondChild();
      secondChild = root->getFirstChild();
      setReversedOperands(true);
      }

   TR::Register *firstRegister = firstChild->getRegister();
   TR::Register *secondRegister = secondChild->getRegister();

   setInputs(firstChild,
             firstRegister,
             secondChild,
             secondRegister,
             false,
             // If either 'preventOperandSwapping' or 'mustSwapOperands' is set then the
             // initial operand ordering set above must be maintained.
             //
             preventOperandSwapping || mustSwapOperands);

   // Make sure any required operand ordering is respected.
   //
   if ((getCmpReg2Reg1() || getCmpReg2Mem1()) && (mustSwapOperands || preventOperandSwapping))
      {
      reverseCmpOp = getCmpReg2Reg1() ? true : false;
      reverseMemOp = getCmpReg2Mem1() ? true : false;
      }

   // Evaluate the children if necessary.
   //
   if (getEvalChild1())
      {
      _cg->evaluate(firstChild);
      }

   if (getEvalChild2())
      {
      _cg->evaluate(secondChild);
      }

   TR::TreeEvaluator::coerceFPOperandsToXMMRs(root, _cg);

   firstRegister = firstChild->getRegister();
   secondRegister = secondChild->getRegister();

   // Generate the compare instruction.
   //
   if (getCmpReg1Mem2() || reverseMemOp)
      {
      TR::MemoryReference *tempMR = generateX86MemoryReference(secondChild, _cg);
      generateRegMemInstruction(cmpRegMemOpCode, root, firstRegister, tempMR, _cg);
      tempMR->decNodeReferenceCounts(_cg);
      }
   else if (getCmpReg2Mem1())
      {
      TR::MemoryReference *tempMR = generateX86MemoryReference(firstChild, _cg);
      generateRegMemInstruction(cmpRegMemOpCode, root, secondRegister, tempMR, _cg);
      notReversedOperands();
      tempMR->decNodeReferenceCounts(_cg);
      }
   else if (getCmpReg1Reg2() || reverseCmpOp)
      {
      generateRegRegInstruction(cmpRegRegOpCode, root, firstRegister, secondRegister, _cg);
      }
   else if (getCmpReg2Reg1())
      {
      generateRegRegInstruction(cmpRegRegOpCode, root, secondRegister, firstRegister, _cg);
      notReversedOperands();
      }

   _cg->decReferenceCount(firstChild);
   _cg->decReferenceCount(secondChild);

   // Update the opcode on the root node if we have swapped its children.
   // TODO: Reverse the children too, or else this looks wrong in the log file
   //
   if (getReversedOperands())
      {
      cmpOp = TR::ILOpCode(cmpOp).getOpCodeForSwapChildren();
      TR::Node::recreate(root, cmpOp);
      }

   return NULL;
   }
TR::Register* OMR::X86::TreeEvaluator::SIMDgetvelemEvaluator(TR::Node* node, TR::CodeGenerator* cg)
   {
   TR::Node* firstChild = node->getChild(0);
   TR::Node* secondChild = node->getChild(1);

   TR::Register* srcVectorReg = cg->evaluate(firstChild);
   TR::Register* resReg = 0;
   TR::Register* lowResReg = 0;
   TR::Register* highResReg = 0;

   int32_t elementCount = -1;

   switch (firstChild->getDataType())
      {
      case TR::VectorInt8:
      case TR::VectorInt16:
         TR_ASSERT(false, "unsupported vector type %s in SIMDgetvelemEvaluator.\n", firstChild->getDataType().toString());
         break;
      case TR::VectorInt32:
         elementCount = 4;
         resReg = cg->allocateRegister();
         break;
      case TR::VectorInt64:
         elementCount = 2;
         if (TR::Compiler->target.is32Bit())
            {
            lowResReg = cg->allocateRegister();
            highResReg = cg->allocateRegister();
            resReg = cg->allocateRegisterPair(lowResReg, highResReg);
            }
         else
            {
            resReg = cg->allocateRegister();
            }
         break;
      case TR::VectorFloat:
         elementCount = 4;
         resReg = cg->allocateSinglePrecisionRegister(TR_FPR);
         break;
      case TR::VectorDouble:
         elementCount = 2;
         resReg = cg->allocateRegister(TR_FPR);
         break;
      default:
         TR_ASSERT(false, "unrecognized vector type %s in SIMDgetvelemEvaluator.\n", firstChild->getDataType().toString());
      }

   if (secondChild->getOpCode().isLoadConst())
      {
      int32_t elem = secondChild->getInt();

      TR_ASSERT(elem >= 0 && elem < elementCount, "Element can only be 0 to %u\n", elementCount - 1);

      uint8_t shufconst = 0x00;
      TR::Register* dstReg = 0;

      if (4 == elementCount)
         {
         /*
          * if elem = 0, access the most significant 32 bits (set shufconst to 0x03)
          * if elem = 1, access the second most significant 32 bits (set shufconst to 0x02)
          * if elem = 2, access the third most significant 32 bits (set shufconst to 0x01)
          * if elem = 3, access the least significant 32 bits (set shufconst to 0x00)
          */
         shufconst = (uint8_t)((3 - elem) & 0x03);

         /*
          * the value to be read (indicated by shufconst) from srcVectorReg is splatted into all 4 slots in the dstReg
          * this puts the value we want in the least significant bits and the other bits should never be read.
          * for float, dstReg and resReg are the same because PSHUFD can work directly with TR_FPR registers
          * for Int32, the result needs to be moved from the dstReg to a TR_GPR resReg.
          */
         if (TR::VectorInt32 == firstChild->getDataType())
            {
            dstReg = cg->allocateRegister(TR_VRF);
            }
         else //TR::VectorFloat == firstChild->getDataType()
            {
            dstReg = resReg;
            }

         /*
          * if elem = 3, the value we want is already in the least significant 32 bits
          * as a result, a mov instruction is good enough and splatting the value is unnecessary
          */
         if (3 == elem)
            {
            generateRegRegInstruction(MOVDQURegReg, node, dstReg, srcVectorReg, cg);
            }
         else
            {
            generateRegRegImmInstruction(PSHUFDRegRegImm1, node, dstReg, srcVectorReg, shufconst, cg);
            }

         if (TR::VectorInt32 == firstChild->getDataType())
            {
            generateRegRegInstruction(MOVDReg4Reg, node, resReg, dstReg, cg);
            cg->stopUsingRegister(dstReg);
            }
         }
      else //2 == elementCount
         {
         /*
          * for double, dstReg and resReg are the same because PSHUFD can work directly with TR_FPR registers
          * for Int64, the result needs to be moved from the dstReg to a TR_GPR resReg.
          */
         if (TR::VectorInt64 == firstChild->getDataType())
            {
            dstReg = cg->allocateRegister(TR_VRF);
            }
         else //TR::VectorDouble == firstChild->getDataType()
            {
            dstReg = resReg;
            }

         /*
          * the value to be read needs to be in the least significant 64 bits.
          * if elem = 0, the value we want is in the most significant 64 bits and needs to be splatted into
          * the least significant 64 bits (the other bits affected by the splat are never read)
          * if elem = 1, the value we want is already in the least significant 64 bits
          * as a result, a mov instruction is good enough and splatting the value is unnecessary
          */
         if (1 == elem)
            {
            generateRegRegInstruction(MOVDQURegReg, node, dstReg, srcVectorReg, cg);
            }
         else //0 == elem
            {
            generateRegRegImmInstruction(PSHUFDRegRegImm1, node, dstReg, srcVectorReg, 0x0e, cg);
            }

         if (TR::VectorInt64 == firstChild->getDataType())
            {
            if (TR::Compiler->target.is32Bit())
               {
               generateRegRegInstruction(MOVDReg4Reg, node, lowResReg, dstReg, cg);
               generateRegRegImmInstruction(PSHUFDRegRegImm1, node, dstReg, srcVectorReg, (0 == elem) ? 0x03 : 0x01, cg);
               generateRegRegInstruction(MOVDReg4Reg, node, highResReg, dstReg, cg);
               }
            else
               {
               generateRegRegInstruction(MOVQReg8Reg, node, resReg, dstReg, cg);
               }
            cg->stopUsingRegister(dstReg);
            }
         }
      }
   else
      {
      //TODO: handle non-constant second child case
      TR_ASSERT(false, "non-const second child not currently supported in SIMDgetvelemEvaluator.\n");
      }

   node->setRegister(resReg);
   cg->decReferenceCount(firstChild);
   cg->decReferenceCount(secondChild);

   return resReg;
   }
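// Illustration only (not OMR code): a scalar model of the constant-index extraction
// above for the 4-lane case. Element 0 is the most significant lane, so the PSHUFD
// immediate that moves the requested lane into the low dword is (3 - elem); the MOVD
// then reads the low dword. Here lanes[] is indexed by register dword, with index 0
// being the least significant dword.
#include <cstdint>

static uint32_t getInt32ElementModel(const uint32_t lanes[4], int32_t elem)
   {
   uint8_t shufconst = (uint8_t)((3 - elem) & 0x03);   // lane that PSHUFD copies into dword 0
   return lanes[shufconst];                            // value MOVD reads from the low dword
   }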
void TR_OutlinedInstructions::generateOutlinedInstructionsDispatch()
   {
   // Switch to cold helper instruction stream.
   //
   TR::Register *vmThreadReg = _cg->getMethodMetaDataRegister();
   TR::Instruction *savedFirstInstruction = comp()->getFirstInstruction();
   TR::Instruction *savedAppendInstruction = comp()->getAppendInstruction();
   comp()->setFirstInstruction(NULL);
   comp()->setAppendInstruction(NULL);

   new (_cg->trHeapMemory()) TR::X86LabelInstruction(NULL, LABEL, _entryLabel, _cg);

   if (_rematerializeVMThread)
      {
      generateRegInstruction(PUSHReg, _callNode, vmThreadReg, _cg);
      generateRestoreVMThreadInstruction(_callNode, _cg);
      TR::MemoryReference *vmThreadMR = generateX86MemoryReference(vmThreadReg, (TR::Compiler->target.is64Bit()) ? 16 : 8, _cg);
      generateRegMemInstruction(LRegMem(), _callNode, vmThreadReg, vmThreadMR, _cg);
      }

   TR::Register *resultReg = NULL;
   if (_callNode->getOpCode().isCallIndirect())
      resultReg = TR::TreeEvaluator::performCall(_callNode, true, false, _cg);
   else
      resultReg = TR::TreeEvaluator::performCall(_callNode, false, false, _cg);

   if (_rematerializeVMThread)
      {
      generateRegInstruction(POPReg, _callNode, vmThreadReg, _cg);
      }

   if (_targetReg)
      {
      TR_ASSERT(resultReg, "assertion failure");
      TR::RegisterPair *targetRegPair = _targetReg->getRegisterPair();
      TR::RegisterPair *resultRegPair = resultReg->getRegisterPair();
      if (targetRegPair)
         {
         TR_ASSERT(resultRegPair, "OutlinedInstructions: targetReg is a register pair and resultReg is not");
         generateRegRegInstruction(_targetRegMovOpcode, _callNode, targetRegPair->getLowOrder(), resultRegPair->getLowOrder(), _cg);
         generateRegRegInstruction(_targetRegMovOpcode, _callNode, targetRegPair->getHighOrder(), resultRegPair->getHighOrder(), _cg);
         }
      else
         {
         TR_ASSERT(!resultRegPair, "OutlinedInstructions: resultReg is a register pair and targetReg is not");
         generateRegRegInstruction(_targetRegMovOpcode, _callNode, _targetReg, resultReg, _cg);
         }
      }

   _cg->decReferenceCount(_callNode);

   if (_restartLabel)
      generateLabelInstruction(JMP4, _callNode, _restartLabel, _cg);
   else
      {
      // Java-specific.
      //
      // No restart label implies we're not coming back from this call,
      // so it's safe to put data after the call. In the case of calling a throw
      // helper, there's an ancient busted handshake that expects to find a 4-byte
      // offset here, so we have to comply...
      //
      // When the handshake is removed, we can delete this zero.
      //
      generateImmInstruction(DDImm4, _callNode, 0, _cg);
      }

   // Dummy label to delimit the end of the helper call dispatch sequence (for exception ranges).
   //
   generateLabelInstruction(LABEL, _callNode, TR::LabelSymbol::create(_cg->trHeapMemory(), _cg), _cg);

   // Switch from cold helper instruction stream.
   //
   _firstInstruction = comp()->getFirstInstruction();
   _appendInstruction = comp()->getAppendInstruction();
   comp()->setFirstInstruction(savedFirstInstruction);
   comp()->setAppendInstruction(savedAppendInstruction);
   }