void SelectionDAGBuilder::LowerStatepoint(
    ImmutableStatepoint ISP, MachineBasicBlock *LandingPad /*=nullptr*/) {
  // The basic scheme here is that information about both the original call and
  // the safepoint is encoded in the CallInst.  We create a temporary call and
  // lower it, then reverse engineer the calling sequence.

  NumOfStatepoints++;
  // Clear state
  StatepointLowering.startNewStatepoint(*this);

  ImmutableCallSite CS(ISP.getCallSite());

#ifndef NDEBUG
  // Consistency check. Don't do this for invokes. It would be too
  // expensive to preserve this information across different basic blocks
  if (!CS.isInvoke()) {
    for (const User *U : CS->users()) {
      const CallInst *Call = cast<CallInst>(U);
      if (isGCRelocate(Call))
        StatepointLowering.scheduleRelocCall(*Call);
    }
  }
#endif

#ifndef NDEBUG
  // If this is a malformed statepoint, report it early to simplify debugging.
  // This should catch any IR level mistake that's made when constructing or
  // transforming statepoints.
  ISP.verify();

  // Check that the associated GCStrategy expects to encounter statepoints.
  assert(GFI->getStrategy().useStatepoints() &&
         "GCStrategy does not expect to encounter statepoints");
#endif

  // Lower statepoint vmstate and gcstate arguments
  SmallVector<SDValue, 10> LoweredMetaArgs;
  lowerStatepointMetaArgs(LoweredMetaArgs, ISP, *this);

  // Get call node, we will replace it later with statepoint
  SDNode *CallNode =
      lowerCallFromStatepoint(ISP, LandingPad, *this, PendingExports);

  // Construct the actual GC_TRANSITION_START, STATEPOINT, and
  // GC_TRANSITION_END nodes with all the appropriate arguments and return
  // values.

  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
  SDValue Chain = CallNode->getOperand(0);

  SDValue Glue;
  bool CallHasIncomingGlue = CallNode->getGluedNode();
  if (CallHasIncomingGlue) {
    // Glue is always last operand
    Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
  }

  // Build the GC_TRANSITION_START node if necessary.
  //
  // The operands to the GC_TRANSITION_{START,END} nodes are laid out in the
  // order in which they appear in the call to the statepoint intrinsic. If
  // any of the operands is pointer-typed, that operand is immediately
  // followed by a SRCVALUE for the pointer that may be used during lowering
  // (e.g. to form MachinePointerInfo values for loads/stores).
  const bool IsGCTransition =
      (ISP.getFlags() & (uint64_t)StatepointFlags::GCTransition) ==
      (uint64_t)StatepointFlags::GCTransition;
  if (IsGCTransition) {
    SmallVector<SDValue, 8> TSOps;

    // Add chain
    TSOps.push_back(Chain);

    // Add GC transition arguments
    for (const Value *V : ISP.gc_transition_args()) {
      TSOps.push_back(getValue(V));
      if (V->getType()->isPointerTy())
        TSOps.push_back(DAG.getSrcValue(V));
    }

    // Add glue if necessary
    if (CallHasIncomingGlue)
      TSOps.push_back(Glue);

    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue GCTransitionStart =
        DAG.getNode(ISD::GC_TRANSITION_START, getCurSDLoc(), NodeTys, TSOps);

    Chain = GCTransitionStart.getValue(0);
    Glue = GCTransitionStart.getValue(1);
  }

  // TODO: Currently, all of these operands are being marked as read/write in
  // PrologEpilogInserter.cpp; we should special-case the VMState arguments
  // and flags to be read-only.
  SmallVector<SDValue, 40> Ops;

  // Add the <id> and <numBytes> constants.
  Ops.push_back(DAG.getTargetConstant(ISP.getID(), getCurSDLoc(), MVT::i64));
  Ops.push_back(
      DAG.getTargetConstant(ISP.getNumPatchBytes(), getCurSDLoc(), MVT::i32));

  // Calculate and push starting position of vmstate arguments
  // Get number of arguments incoming directly into call node
  unsigned NumCallRegArgs =
      CallNode->getNumOperands() - (CallHasIncomingGlue ? 4 : 3);
  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, getCurSDLoc(), MVT::i32));

  // Add call target
  SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
  Ops.push_back(CallTarget);

  // Add call arguments
  // Get position of register mask in the call
  SDNode::op_iterator RegMaskIt;
  if (CallHasIncomingGlue)
    RegMaskIt = CallNode->op_end() - 2;
  else
    RegMaskIt = CallNode->op_end() - 1;
  Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);

  // Add a constant argument for the calling convention
  pushStackMapConstant(Ops, *this, CS.getCallingConv());

  // Add a constant argument for the flags
  uint64_t Flags = ISP.getFlags();
  assert(((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0) &&
         "unknown flag used");
  pushStackMapConstant(Ops, *this, Flags);

  // Insert all vmstate and gcstate arguments
  Ops.insert(Ops.end(), LoweredMetaArgs.begin(), LoweredMetaArgs.end());

  // Add register mask from call node
  Ops.push_back(*RegMaskIt);

  // Add chain
  Ops.push_back(Chain);

  // Same for the glue, but we add it only if original call had it
  if (Glue.getNode())
    Ops.push_back(Glue);

  // Compute return values.  Provide a glue output since we consume one as
  // input.  This allows someone else to chain off us as needed.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  SDNode *StatepointMCNode =
      DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);

  SDNode *SinkNode = StatepointMCNode;

  // Build the GC_TRANSITION_END node if necessary.
  //
  // See the comment above regarding GC_TRANSITION_START for the layout of
  // the operands to the GC_TRANSITION_END node.
  if (IsGCTransition) {
    SmallVector<SDValue, 8> TEOps;

    // Add chain
    TEOps.push_back(SDValue(StatepointMCNode, 0));

    // Add GC transition arguments
    for (const Value *V : ISP.gc_transition_args()) {
      TEOps.push_back(getValue(V));
      if (V->getType()->isPointerTy())
        TEOps.push_back(DAG.getSrcValue(V));
    }

    // Add glue
    TEOps.push_back(SDValue(StatepointMCNode, 1));

    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue GCTransitionStart =
        DAG.getNode(ISD::GC_TRANSITION_END, getCurSDLoc(), NodeTys, TEOps);

    SinkNode = GCTransitionStart.getNode();
  }

  // Replace original call
  DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
  // Remove original call node
  DAG.DeleteNode(CallNode);

  // DON'T set the root - under the assumption that it's already set past the
  // inserted node we created.

  // TODO: A better future implementation would be to emit a single variable
  // argument, variable return value STATEPOINT node here and then hookup the
  // return value of each gc.relocate to the respective output of the
  // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
  // to actually be possible today.
}
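//===----------------------------------------------------------------------===//
// Aside: a minimal, self-contained sketch of the STATEPOINT operand order
// assembled into 'Ops' above. Everything here is a toy stand-in, not an LLVM
// API: strings model SDValues, and the id/target/argument values are
// hypothetical. Only the ordering mirrors the real node: id, numBytes,
// numCallArgs, call target, register call arguments, the stack-map constants
// for calling convention and flags, the vmstate/gcstate metadata, then the
// register mask and chain.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const uint64_t ID = 42;           // <id> (hypothetical)
  const uint32_t NumPatchBytes = 0; // <numBytes>
  std::vector<std::string> CallArgs = {"%rdi", "%rsi"};       // register args
  std::vector<std::string> MetaArgs = {"vmstate0", "gcptr0"}; // lowered meta

  std::vector<std::string> Ops;
  Ops.push_back("id=" + std::to_string(ID));
  Ops.push_back("numBytes=" + std::to_string(NumPatchBytes));
  Ops.push_back("numCallArgs=" + std::to_string(CallArgs.size()));
  Ops.push_back("target=@foo"); // hypothetical callee
  Ops.insert(Ops.end(), CallArgs.begin(), CallArgs.end());
  Ops.push_back("cc=0");    // calling-convention stack-map constant
  Ops.push_back("flags=0"); // statepoint-flags stack-map constant
  Ops.insert(Ops.end(), MetaArgs.begin(), MetaArgs.end());
  Ops.push_back("<regmask>");
  Ops.push_back("<chain>");

  for (const std::string &Op : Ops)
    std::cout << Op << '\n';
}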
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(TM.getInstrInfo());
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case AMDGPUISD::CONST_ADDRESS: {
    for (SDNode::use_iterator I = N->use_begin(), Next = llvm::next(I);
         I != SDNode::use_end(); I = Next) {
      Next = llvm::next(I);
      if (!I->isMachineOpcode()) {
        continue;
      }
      unsigned Opcode = I->getMachineOpcode();
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SrcIdx = I.getOperandNo();
      int SelIdx;
      // Unlike MachineInstrs, SDNodes do not have results in their operand
      // list, so we need to increment the SrcIdx, since
      // R600InstrInfo::getOperandIdx is based on the MachineInstr indices.
      if (HasDst) {
        SrcIdx++;
      }

      SelIdx = TII->getSelIdx(I->getMachineOpcode(), SrcIdx);
      if (SelIdx < 0) {
        continue;
      }

      SDValue CstOffset;
      if (N->getValueType(0).isVector() ||
          !SelectGlobalValueConstantOffset(N->getOperand(0), CstOffset))
        continue;

      // Gather constant values
      int SrcIndices[] = {
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
      };
      std::vector<unsigned> Consts;
      for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
        int OtherSrcIdx = SrcIndices[i];
        int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
        if (OtherSrcIdx < 0 || OtherSelIdx < 0) {
          continue;
        }
        if (HasDst) {
          OtherSrcIdx--;
          OtherSelIdx--;
        }
        if (RegisterSDNode *Reg =
            dyn_cast<RegisterSDNode>(I->getOperand(OtherSrcIdx))) {
          if (Reg->getReg() == AMDGPU::ALU_CONST) {
            ConstantSDNode *Cst =
                dyn_cast<ConstantSDNode>(I->getOperand(OtherSelIdx));
            Consts.push_back(Cst->getZExtValue());
          }
        }
      }

      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
      Consts.push_back(Cst->getZExtValue());
      if (!TII->fitsConstReadLimitations(Consts))
        continue;

      // Convert back to SDNode indices
      if (HasDst) {
        SrcIdx--;
        SelIdx--;
      }
      std::vector<SDValue> Ops;
      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
        if (i == SrcIdx) {
          Ops.push_back(CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32));
        } else if (i == SelIdx) {
          Ops.push_back(CstOffset);
        } else {
          Ops.push_back(I->getOperand(i));
        }
      }
      CurDAG->UpdateNodeOperands(*I, Ops.data(), Ops.size());
    }
    break;
  }
  case ISD::BUILD_VECTOR: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }

    unsigned RegClassID;
    switch(N->getValueType(0).getVectorNumElements()) {
    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF plus four
    // INSERT_SUBREG nodes, which produces a 128-bit register copy when it
    // goes through the TwoAddressInstructions pass.  We want to avoid 128-bit
    // copies as much as possible because they can't be bundled by our
    // scheduler.
    SDValue RegSeqArgs[9] = {
      CurDAG->getTargetConstant(RegClassID, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
    };
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[2 * i + 1] = N->getOperand(i);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
                                RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
                                  N->getValueType(0), Ops);
  }

  case ISD::ConstantFP:
  case ISD::Constant: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    // XXX: Custom immediate lowering not implemented yet.  Instead we use
    // pseudo instructions defined in SIInstructions.td
    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }

    uint64_t ImmValue = 0;
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;

    if (N->getOpcode() == ISD::ConstantFP) {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::f64);

      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
      APFloat Value = C->getValueAPF();
      float FloatValue = Value.convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = Value.bitcastToAPInt().getZExtValue();
      }
    } else {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::i64);

      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (C->getZExtValue() == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (C->getZExtValue() == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = C->getZExtValue();
      }
    }

    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
         Use != SDNode::use_end(); Use = Next) {
      Next = llvm::next(Use);
      std::vector<SDValue> Ops;
      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
        Ops.push_back(Use->getOperand(i));
      }

      if (!Use->isMachineOpcode()) {
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          // We can only use literal constants (e.g. AMDGPU::ZERO,
          // AMDGPU::ONE, etc) in machine opcodes.
          continue;
        }
      } else {
        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
            (TII->get(Use->getMachineOpcode()).TSFlags &
             R600_InstFlag::VECTOR)) {
          continue;
        }

        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                        AMDGPU::OpName::literal);
        if (ImmIdx == -1) {
          continue;
        }

        if (TII->getOperandIdx(Use->getMachineOpcode(),
                               AMDGPU::OpName::dst) != -1) {
          // Subtract one from ImmIdx, because the DST operand is usually
          // index 0 for MachineInstrs, but we have no DST in the Ops vector.
          ImmIdx--;
        }

        // Check that we aren't already using an immediate.
        // XXX: It's possible for an instruction to have more than one
        // immediate operand, but this is not supported yet.
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          ConstantSDNode *C =
              dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
          assert(C);
          if (C->getZExtValue() != 0) {
            // This instruction is already using an immediate.
            continue;
          }

          // Set the immediate value
          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
        }
      }
      // Set the immediate register
      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);

      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
    }
    break;
  }
  }

  SDNode *Result = SelectCode(N);

  // Fold operands of selected node
  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
    if (Result && Result->isMachineOpcode() &&
        Result->getMachineOpcode() == AMDGPU::DOT_4) {
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);
    }
    if (Result && Result->isMachineOpcode() &&
        !(TII->get(Result->getMachineOpcode()).TSFlags &
          R600_InstFlag::VECTOR) &&
        TII->hasInstrModifiers(Result->getMachineOpcode())) {
      // Fold FNEG/FABS
      // TODO: Isel can generate multiple MachineInstrs; we need to
      // recursively parse Result
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

      // If the node has a single use which is CLAMP_R600, fold it
      if (Result->hasOneUse() && Result->isMachineOpcode()) {
        SDNode *PotentialClamp = *Result->use_begin();
        if (PotentialClamp->isMachineOpcode() &&
            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
          unsigned ClampIdx =
              TII->getOperandIdx(Result->getMachineOpcode(),
                                 AMDGPU::OpName::clamp);
          std::vector<SDValue> Ops;
          unsigned NumOp = Result->getNumOperands();
          for (unsigned i = 0; i < NumOp; ++i) {
            Ops.push_back(Result->getOperand(i));
          }
          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
          Result = CurDAG->SelectNodeTo(PotentialClamp,
                                        Result->getMachineOpcode(),
                                        PotentialClamp->getVTList(),
                                        Ops.data(), NumOp);
        }
      }
    }
  }
  return Result;
}
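//===----------------------------------------------------------------------===//
// Aside: the real R600InstrInfo::fitsConstReadLimitations models the
// hardware's constant-cache read ports; the toy below substitutes a simpler,
// purely hypothetical rule (at most four distinct constants per instruction
// group) just to show the shape of the check the CONST_ADDRESS folding above
// performs before rewriting a use.
#include <algorithm>
#include <vector>

// Toy stand-in, not the R600 rule: dedupe the gathered constants and compare
// against a hypothetical hardware limit.
bool fitsConstReadLimitationsToy(std::vector<unsigned> Consts) {
  std::sort(Consts.begin(), Consts.end());
  Consts.erase(std::unique(Consts.begin(), Consts.end()), Consts.end());
  return Consts.size() <= 4; // hypothetical limit
}

int main() {
  std::vector<unsigned> Consts = {0, 4, 4, 8, 12}; // four distinct values
  return fitsConstReadLimitationsToy(Consts) ? 0 : 1;
}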
void ScheduleDAGSDNodes::BuildSchedUnits() {
  // During scheduling, the NodeId field of SDNode is used to map SDNodes
  // to their associated SUnits by holding SUnits table indices. A value
  // of -1 means the SDNode does not yet have an associated SUnit.
  unsigned NumNodes = 0;
  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
       E = DAG->allnodes_end(); NI != E; ++NI) {
    NI->setNodeId(-1);
    ++NumNodes;
  }

  // Reserve entries in the vector for each of the SUnits we are creating.
  // This ensures that reallocation of the vector won't happen, so SUnit*'s
  // won't get invalidated.
  // FIXME: Multiply by 2 because we may clone nodes during scheduling.
  // This is a temporary workaround.
  SUnits.reserve(NumNodes * 2);

  // Add all nodes in depth first order.
  SmallVector<SDNode*, 64> Worklist;
  SmallPtrSet<SDNode*, 64> Visited;
  Worklist.push_back(DAG->getRoot().getNode());
  Visited.insert(DAG->getRoot().getNode());

  while (!Worklist.empty()) {
    SDNode *NI = Worklist.pop_back_val();

    // Add all operands to the worklist unless they've already been added.
    for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i)
      if (Visited.insert(NI->getOperand(i).getNode()))
        Worklist.push_back(NI->getOperand(i).getNode());

    if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
      continue;

    // If this node has already been processed, stop now.
    if (NI->getNodeId() != -1)
      continue;

    SUnit *NodeSUnit = NewSUnit(NI);

    // See if anything is flagged to this node, if so, add them to flagged
    // nodes.  Nodes can have at most one flag input and one flag output.
    // Flags are required to be the last operand and result of a node.

    // Scan up to find flagged preds.
    SDNode *N = NI;
    while (N->getNumOperands() &&
           N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Flag) {
      N = N->getOperand(N->getNumOperands()-1).getNode();
      assert(N->getNodeId() == -1 && "Node already inserted!");
      N->setNodeId(NodeSUnit->NodeNum);
    }

    // Scan down to find any flagged succs.
    N = NI;
    while (N->getValueType(N->getNumValues()-1) == MVT::Flag) {
      SDValue FlagVal(N, N->getNumValues()-1);

      // There are either zero or one users of the Flag result.
      bool HasFlagUse = false;
      for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
           UI != E; ++UI)
        if (FlagVal.isOperandOf(*UI)) {
          HasFlagUse = true;
          assert(N->getNodeId() == -1 && "Node already inserted!");
          N->setNodeId(NodeSUnit->NodeNum);
          N = *UI;
          break;
        }
      if (!HasFlagUse) break;
    }

    // If there are flag operands involved, N is now the bottom-most node
    // of the sequence of nodes that are flagged together.
    // Update the SUnit.
    NodeSUnit->setNode(N);
    assert(N->getNodeId() == -1 && "Node already inserted!");
    N->setNodeId(NodeSUnit->NodeNum);

    // Assign the Latency field of NodeSUnit using target-provided information.
    ComputeLatency(NodeSUnit);
  }
}
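//===----------------------------------------------------------------------===//
// Aside: a minimal sketch of the depth-first worklist walk BuildSchedUnits
// uses to reach every node from the root exactly once. 'Node' is a toy
// stand-in for SDNode; only the traversal pattern (explicit stack plus a
// visited set, operands pushed before the node is processed) is modeled.
#include <iostream>
#include <unordered_set>
#include <vector>

struct Node {
  int Id;
  std::vector<Node*> Operands;
};

void visitAllFromRoot(Node *Root) {
  std::vector<Node*> Worklist;
  std::unordered_set<Node*> Visited;
  Worklist.push_back(Root);
  Visited.insert(Root);
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    // Add all operands to the worklist unless they've already been added.
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second)
        Worklist.push_back(Op);
    std::cout << "processing node " << N->Id << '\n';
  }
}

int main() {
  Node A{0, {}}, B{1, {&A}}, Root{2, {&B, &A}};
  visitAllFromRoot(&Root); // A is reached once, despite two paths to it
}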
void ScheduleDAGSDNodes::AddSchedEdges() {
  const TargetSubtarget &ST = TM.getSubtarget<TargetSubtarget>();

  // Check to see if the scheduler cares about latencies.
  bool UnitLatencies = ForceUnitLatencies();

  // Pass 2: add the preds, succs, etc.
  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
    SUnit *SU = &SUnits[su];
    SDNode *MainNode = SU->getNode();

    if (MainNode->isMachineOpcode()) {
      unsigned Opc = MainNode->getMachineOpcode();
      const TargetInstrDesc &TID = TII->get(Opc);
      for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
        if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
          SU->isTwoAddress = true;
          break;
        }
      }
      if (TID.isCommutable())
        SU->isCommutable = true;
    }

    // Find all predecessors and successors of the group.
    for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode()) {
      if (N->isMachineOpcode() &&
          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
        SU->hasPhysRegClobbers = true;
        unsigned NumUsed = InstrEmitter::CountResults(N);
        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
          --NumUsed;    // Skip over unused values at the end.
        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
          SU->hasPhysRegDefs = true;
      }

      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
        SDNode *OpN = N->getOperand(i).getNode();
        if (isPassiveNode(OpN)) continue;   // Not scheduled.
        SUnit *OpSU = &SUnits[OpN->getNodeId()];
        assert(OpSU && "Node has no SUnit!");
        if (OpSU == SU) continue;           // In the same group.

        EVT OpVT = N->getOperand(i).getValueType();
        assert(OpVT != MVT::Flag && "Flagged nodes should be in same sunit!");
        bool isChain = OpVT == MVT::Other;

        unsigned PhysReg = 0;
        int Cost = 1;
        // Determine if this is a physical register dependency.
        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
        assert((PhysReg == 0 || !isChain) &&
               "Chain dependence via physreg data?");
        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
        // emits a copy from the physical register to a virtual register
        // unless it requires a cross class copy (cost < 0). That means we are
        // only treating "expensive to copy" register dependency as physical
        // register dependency. This may change in the future though.
        if (Cost >= 0)
          PhysReg = 0;

        // If this is a ctrl dep, latency is 1.
        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
        const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                               OpLatency, PhysReg);
        if (!isChain && !UnitLatencies) {
          ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
          ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
        }

        SU->addPred(dep);
      }
    }
  }
}
/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
/// implicit physical register output.
void InstrEmitter::
EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
                unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
  unsigned VRBase = 0;
  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
    // Just use the input register directly!
    SDValue Op(Node, ResNo);
    if (IsClone)
      VRBaseMap.erase(Op);
    bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
    (void)isNew; // Silence compiler warning.
    assert(isNew && "Node emitted out of order - early");
    return;
  }

  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
  // the CopyToReg'd destination register instead of creating a new vreg.
  bool MatchReg = true;
  const TargetRegisterClass *UseRC = NULL;
  if (!IsClone && !IsCloned)
    for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
         UI != E; ++UI) {
      SDNode *User = *UI;
      bool Match = true;
      if (User->getOpcode() == ISD::CopyToReg &&
          User->getOperand(2).getNode() == Node &&
          User->getOperand(2).getResNo() == ResNo) {
        unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
        if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
          VRBase = DestReg;
          Match = false;
        } else if (DestReg != SrcReg)
          Match = false;
      } else {
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          SDValue Op = User->getOperand(i);
          if (Op.getNode() != Node || Op.getResNo() != ResNo)
            continue;
          EVT VT = Node->getValueType(Op.getResNo());
          if (VT == MVT::Other || VT == MVT::Glue)
            continue;
          Match = false;
          if (User->isMachineOpcode()) {
            const TargetInstrDesc &II = TII->get(User->getMachineOpcode());
            const TargetRegisterClass *RC = 0;
            if (i+II.getNumDefs() < II.getNumOperands())
              RC = II.OpInfo[i+II.getNumDefs()].getRegClass(TRI);
            if (!UseRC)
              UseRC = RC;
            else if (RC) {
              const TargetRegisterClass *ComRC = getCommonSubClass(UseRC, RC);
              // If multiple uses expect disjoint register classes, we emit
              // copies in AddRegisterOperand.
              if (ComRC)
                UseRC = ComRC;
            }
          }
        }
      }
      MatchReg &= Match;
      if (VRBase)
        break;
    }

  EVT VT = Node->getValueType(ResNo);
  const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
  SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);

  // Figure out the register class to create for the destreg.
  if (VRBase) {
    DstRC = MRI->getRegClass(VRBase);
  } else if (UseRC) {
    assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
    DstRC = UseRC;
  } else {
    DstRC = TLI->getRegClassFor(VT);
  }

  // If all uses are reading from the src physical register and copying the
  // register is either impossible or very expensive, then don't create a copy.
  if (MatchReg && SrcRC->getCopyCost() < 0) {
    VRBase = SrcReg;
  } else {
    // Create the reg, emit the copy.
    VRBase = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
            VRBase).addReg(SrcReg);
  }

  SDValue Op(Node, ResNo);
  if (IsClone)
    VRBaseMap.erase(Op);
  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
  (void)isNew; // Silence compiler warning.
  assert(isNew && "Node emitted out of order - early");
}
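//===----------------------------------------------------------------------===//
// Aside: the core decision in EmitCopyFromReg, reduced to a toy. If the only
// interesting consumer is a copy into a virtual register, reuse that register
// instead of allocating a fresh one. 'Use', 'chooseVRBase', and the register
// encoding below are all hypothetical stand-ins, not LLVM types.
#include <vector>

struct Use {
  bool IsCopyToReg;
  unsigned DestReg;
};

constexpr unsigned FirstVirtualReg = 1u << 31; // hypothetical encoding

bool isVirtual(unsigned Reg) { return Reg >= FirstVirtualReg; }

unsigned chooseVRBase(const std::vector<Use> &Uses, unsigned NextVReg) {
  for (const Use &U : Uses)
    if (U.IsCopyToReg && isVirtual(U.DestReg))
      return U.DestReg; // reuse the CopyToReg destination
  return NextVReg;      // otherwise create a new virtual register
}

int main() {
  std::vector<Use> Uses = {{true, FirstVirtualReg + 5}};
  return chooseVRBase(Uses, FirstVirtualReg + 9) == FirstVirtualReg + 5 ? 0 : 1;
}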
/// EmitMachineNode - Generate machine code for a target-specific node and
/// needed dependencies.
///
void InstrEmitter::
EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
                DenseMap<SDValue, unsigned> &VRBaseMap) {
  unsigned Opc = Node->getMachineOpcode();

  // Handle subreg insert/extract specially
  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
      Opc == TargetOpcode::INSERT_SUBREG ||
      Opc == TargetOpcode::SUBREG_TO_REG) {
    EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
    return;
  }

  // Handle COPY_TO_REGCLASS specially.
  if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
    EmitCopyToRegClassNode(Node, VRBaseMap);
    return;
  }

  // Handle REG_SEQUENCE specially.
  if (Opc == TargetOpcode::REG_SEQUENCE) {
    EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
    return;
  }

  if (Opc == TargetOpcode::IMPLICIT_DEF)
    // We want a unique VR for each IMPLICIT_DEF use.
    return;

  const TargetInstrDesc &II = TII->get(Opc);
  unsigned NumResults = CountResults(Node);
  unsigned NodeOperands = CountOperands(Node);
  bool HasPhysRegOuts = NumResults > II.getNumDefs() &&
                        II.getImplicitDefs() != 0;
#ifndef NDEBUG
  unsigned NumMIOperands = NodeOperands + NumResults;
  if (II.isVariadic())
    assert(NumMIOperands >= II.getNumOperands() &&
           "Too few operands for a variadic node!");
  else
    assert(NumMIOperands >= II.getNumOperands() &&
           NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() &&
           "#operands for dag node doesn't match .td file!");
#endif

  // Create the new machine instruction.
  MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II);

  // The MachineInstr constructor adds implicit-def operands.  Scan through
  // these to determine which are dead.
  if (MI->getNumOperands() != 0 &&
      Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
    // First, collect all used registers.
    SmallVector<unsigned, 8> UsedRegs;
    for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser())
      if (F->getOpcode() == ISD::CopyFromReg)
        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
      else {
        // Collect declared implicit uses.
        const TargetInstrDesc &TID = TII->get(F->getMachineOpcode());
        UsedRegs.append(TID.getImplicitUses(),
                        TID.getImplicitUses() + TID.getNumImplicitUses());
        // In addition to declared implicit uses, we must also check for
        // direct RegisterSDNode operands.
        for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
          if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
            unsigned Reg = R->getReg();
            if (TargetRegisterInfo::isPhysicalRegister(Reg))
              UsedRegs.push_back(Reg);
          }
      }
    // Then mark unused registers as dead.
    MI->setPhysRegsDeadExcept(UsedRegs, *TRI);
  }

  // Add result register values for things that are defined by this
  // instruction.
  if (NumResults)
    CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap);

  // Emit all of the actual operands of this instruction, adding them to the
  // instruction as appropriate.
  bool HasOptPRefs = II.getNumDefs() > NumResults;
  assert((!HasOptPRefs || !HasPhysRegOuts) &&
         "Unable to cope with optional defs and phys regs defs!");
  unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
  for (unsigned i = NumSkip; i != NodeOperands; ++i)
    AddOperand(MI, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
               VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);

  // Transfer all of the memory reference descriptions of this instruction.
  MI->setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
                 cast<MachineSDNode>(Node)->memoperands_end());

  // Insert the instruction into position in the block. This needs to
  // happen before any custom inserter hook is called so that the
  // hook knows where in the block to insert the replacement code.
  MBB->insert(InsertPos, MI);

  // Additional results must be physical register defs.
  if (HasPhysRegOuts) {
    for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
      unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
      if (Node->hasAnyUseOfValue(i))
        EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
      // If there are no uses, mark the register as dead now, so that
      // MachineLICM/Sink can see that it's dead. Don't do this if the
      // node has a Glue value, for the benefit of targets still using
      // Glue for values in physregs.
      else if (Node->getValueType(Node->getNumValues()-1) != MVT::Glue)
        MI->addRegisterDead(Reg, TRI);
    }
  }

  // If the instruction has implicit defs and the node doesn't, mark the
  // implicit def as dead.  If the node has any glue outputs, we don't do this
  // because we don't know what implicit defs are being used by glued nodes.
  if (Node->getValueType(Node->getNumValues()-1) != MVT::Glue)
    if (const unsigned *IDList = II.getImplicitDefs()) {
      for (unsigned i = NumResults,
           e = II.getNumDefs()+II.getNumImplicitDefs(); i != e; ++i)
        MI->addRegisterDead(IDList[i-II.getNumDefs()], TRI);
    }
}
/// EmitMachineNode - Generate machine code for a target-specific node and
/// needed dependencies.
///
void InstrEmitter::
EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
                DenseMap<SDValue, unsigned> &VRBaseMap) {
  unsigned Opc = Node->getMachineOpcode();

  // Handle subreg insert/extract specially
  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
      Opc == TargetOpcode::INSERT_SUBREG ||
      Opc == TargetOpcode::SUBREG_TO_REG) {
    EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
    return;
  }

  // Handle COPY_TO_REGCLASS specially.
  if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
    EmitCopyToRegClassNode(Node, VRBaseMap);
    return;
  }

  // Handle REG_SEQUENCE specially.
  if (Opc == TargetOpcode::REG_SEQUENCE) {
    EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
    return;
  }

  if (Opc == TargetOpcode::IMPLICIT_DEF)
    // We want a unique VR for each IMPLICIT_DEF use.
    return;

  const MCInstrDesc &II = TII->get(Opc);
  unsigned NumResults = CountResults(Node);
  unsigned NumDefs = II.getNumDefs();
  const MCPhysReg *ScratchRegs = nullptr;

  // Handle STACKMAP and PATCHPOINT specially and then use the generic code.
  if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
    // Stackmaps do not have arguments and do not preserve their calling
    // convention. However, to simplify runtime support, they clobber the same
    // scratch registers as AnyRegCC.
    unsigned CC = CallingConv::AnyReg;
    if (Opc == TargetOpcode::PATCHPOINT) {
      CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
      NumDefs = NumResults;
    }
    ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
  }

  unsigned NumImpUses = 0;
  unsigned NodeOperands =
    countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
  bool HasPhysRegOuts = NumResults > NumDefs &&
                        II.getImplicitDefs() != nullptr;
#ifndef NDEBUG
  unsigned NumMIOperands = NodeOperands + NumResults;
  if (II.isVariadic())
    assert(NumMIOperands >= II.getNumOperands() &&
           "Too few operands for a variadic node!");
  else
    assert(NumMIOperands >= II.getNumOperands() &&
           NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() +
                            NumImpUses &&
           "#operands for dag node doesn't match .td file!");
#endif

  // Create the new machine instruction.
  MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II);

  // Add result register values for things that are defined by this
  // instruction.
  if (NumResults)
    CreateVirtualRegisters(Node, MIB, II, IsClone, IsCloned, VRBaseMap);

  // Emit all of the actual operands of this instruction, adding them to the
  // instruction as appropriate.
  bool HasOptPRefs = NumDefs > NumResults;
  assert((!HasOptPRefs || !HasPhysRegOuts) &&
         "Unable to cope with optional defs and phys regs defs!");
  unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
  for (unsigned i = NumSkip; i != NodeOperands; ++i)
    AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
               VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);

  // Add scratch registers as implicit def and early clobber
  if (ScratchRegs)
    for (unsigned i = 0; ScratchRegs[i]; ++i)
      MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
                                 RegState::EarlyClobber);

  // Transfer all of the memory reference descriptions of this instruction.
  MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
                 cast<MachineSDNode>(Node)->memoperands_end());

  // Insert the instruction into position in the block. This needs to
  // happen before any custom inserter hook is called so that the
  // hook knows where in the block to insert the replacement code.
  MBB->insert(InsertPos, MIB);

  // The MachineInstr may also define physregs instead of virtregs.  These
  // physreg values can reach other instructions in different ways:
  //
  // 1. When there is a use of a Node value beyond the explicitly defined
  //    virtual registers, we emit a CopyFromReg for one of the implicitly
  //    defined physregs.  This only happens when HasPhysRegOuts is true.
  //
  // 2. A CopyFromReg reading a physreg may be glued to this instruction.
  //
  // 3. A glued instruction may implicitly use a physreg.
  //
  // 4. A glued instruction may use a RegisterSDNode operand.
  //
  // Collect all the used physreg defs, and make sure that any unused physreg
  // defs are marked as dead.
  SmallVector<unsigned, 8> UsedRegs;

  // Additional results must be physical register defs.
  if (HasPhysRegOuts) {
    for (unsigned i = NumDefs; i < NumResults; ++i) {
      unsigned Reg = II.getImplicitDefs()[i - NumDefs];
      if (!Node->hasAnyUseOfValue(i))
        continue;
      // This implicitly defined physreg has a use.
      UsedRegs.push_back(Reg);
      EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
    }
  }

  // Scan the glue chain for any used physregs.
  if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
    for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
      if (F->getOpcode() == ISD::CopyFromReg) {
        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
        continue;
      } else if (F->getOpcode() == ISD::CopyToReg) {
        // Skip CopyToReg nodes that are internal to the glue chain.
        continue;
      }
      // Collect declared implicit uses.
      const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
      UsedRegs.append(MCID.getImplicitUses(),
                      MCID.getImplicitUses() + MCID.getNumImplicitUses());
      // In addition to declared implicit uses, we must also check for
      // direct RegisterSDNode operands.
      for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
        if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
          unsigned Reg = R->getReg();
          if (TargetRegisterInfo::isPhysicalRegister(Reg))
            UsedRegs.push_back(Reg);
        }
    }
  }

  // Finally mark unused registers as dead.
  if (!UsedRegs.empty() || II.getImplicitDefs())
    MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);

  // Run post-isel target hook to adjust this instruction if needed.
  if (II.hasPostISelHook())
    TLI->AdjustInstrPostInstrSelection(MIB, Node);
}
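//===----------------------------------------------------------------------===//
// Aside: a sketch of the glue-chain scan above on a toy node type. Walk the
// chain of glued users, collect the physregs each one reads, and everything
// not in that set is a candidate for being marked dead. 'ToyNode' and
// 'collectUsedRegs' are hypothetical stand-ins, not LLVM types.
#include <iostream>
#include <vector>

struct ToyNode {
  std::vector<unsigned> PhysRegUses; // physregs this node reads
  ToyNode *GluedUser;                // next node in the glue chain, or null
};

std::vector<unsigned> collectUsedRegs(ToyNode *Start) {
  std::vector<unsigned> UsedRegs;
  for (ToyNode *F = Start->GluedUser; F; F = F->GluedUser)
    UsedRegs.insert(UsedRegs.end(), F->PhysRegUses.begin(),
                    F->PhysRegUses.end());
  return UsedRegs;
}

int main() {
  ToyNode C{{3}, nullptr}, B{{1, 2}, &C}, A{{}, &B};
  for (unsigned R : collectUsedRegs(&A))
    std::cout << "used physreg " << R << '\n'; // prints 1, 2, 3
}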
/// run - This is the main entry point for the type legalizer.  This does a
/// top-down traversal of the dag, legalizing types as it goes.  Returns
/// "true" if it made any changes.
bool DAGTypeLegalizer::run() {
  bool Changed = false;

  // Create a dummy node (which is not added to allnodes), that adds a
  // reference to the root node, preventing it from being deleted, and
  // tracking any changes of the root.
  HandleSDNode Dummy(DAG.getRoot());
  Dummy.setNodeId(Unanalyzed);

  // The root of the dag may dangle to deleted nodes until the type legalizer
  // is done.  Set it to null to avoid confusion.
  DAG.setRoot(SDValue());

  // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
  // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
  // non-leaves.
  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
       E = DAG.allnodes_end(); I != E; ++I) {
    if (I->getNumOperands() == 0) {
      I->setNodeId(ReadyToProcess);
      Worklist.push_back(I);
    } else {
      I->setNodeId(Unanalyzed);
    }
  }

  // Now that we have a set of nodes to process, handle them all.
  while (!Worklist.empty()) {
#ifndef XDEBUG
    if (EnableExpensiveChecks)
#endif
      PerformExpensiveChecks();

    SDNode *N = Worklist.back();
    Worklist.pop_back();
    assert(N->getNodeId() == ReadyToProcess &&
           "Node should be ready if on worklist!");

    if (IgnoreNodeResults(N))
      goto ScanOperands;

    // Scan the values produced by the node, checking to see if any result
    // types are illegal.
    for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
      EVT ResultVT = N->getValueType(i);
      switch (getTypeAction(ResultVT)) {
      case TargetLowering::TypeLegal:
        break;
      // The following calls must take care of *all* of the node's results,
      // not just the illegal result they were passed (this includes results
      // with a legal type).  Results can be remapped using ReplaceValueWith,
      // or their promoted/expanded/etc values registered in PromotedIntegers,
      // ExpandedIntegers etc.
      case TargetLowering::TypePromoteInteger:
        PromoteIntegerResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeExpandInteger:
        ExpandIntegerResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeSoftenFloat:
        SoftenFloatResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeExpandFloat:
        ExpandFloatResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeScalarizeVector:
        ScalarizeVectorResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeSplitVector:
        SplitVectorResult(N, i);
        Changed = true;
        goto NodeDone;
      case TargetLowering::TypeWidenVector:
        WidenVectorResult(N, i);
        Changed = true;
        goto NodeDone;
      }
    }

ScanOperands:
    // Scan the operand list for the node, handling any nodes with operands
    // that are illegal.
    {
    unsigned NumOperands = N->getNumOperands();
    bool NeedsReanalyzing = false;
    unsigned i;
    for (i = 0; i != NumOperands; ++i) {
      if (IgnoreNodeResults(N->getOperand(i).getNode()))
        continue;

      EVT OpVT = N->getOperand(i).getValueType();
      switch (getTypeAction(OpVT)) {
      case TargetLowering::TypeLegal:
        continue;
      // The following calls must either replace all of the node's results
      // using ReplaceValueWith, and return "false"; or update the node's
      // operands in place, and return "true".
      case TargetLowering::TypePromoteInteger:
        NeedsReanalyzing = PromoteIntegerOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeExpandInteger:
        NeedsReanalyzing = ExpandIntegerOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeSoftenFloat:
        NeedsReanalyzing = SoftenFloatOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeExpandFloat:
        NeedsReanalyzing = ExpandFloatOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeScalarizeVector:
        NeedsReanalyzing = ScalarizeVectorOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeSplitVector:
        NeedsReanalyzing = SplitVectorOperand(N, i);
        Changed = true;
        break;
      case TargetLowering::TypeWidenVector:
        NeedsReanalyzing = WidenVectorOperand(N, i);
        Changed = true;
        break;
      }
      break;
    }

    // The sub-method updated N in place.  Check to see if any operands are
    // new, and if so, mark them.  If the node needs revisiting, don't add all
    // users to the worklist etc.
    if (NeedsReanalyzing) {
      assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
      N->setNodeId(NewNode);
      // Recompute the NodeId and correct processed operands, adding the node
      // to the worklist if ready.
      SDNode *M = AnalyzeNewNode(N);
      if (M == N)
        // The node didn't morph - nothing special to do, it will be
        // revisited.
        continue;

      // The node morphed - this is equivalent to legalizing by replacing
      // every value of N with the corresponding value of M.  So do that now.
      assert(N->getNumValues() == M->getNumValues() &&
             "Node morphing changed the number of results!");
      for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
        // Replacing the value takes care of remapping the new value.
        ReplaceValueWith(SDValue(N, i), SDValue(M, i));
      assert(N->getNodeId() == NewNode && "Unexpected node state!");
      // The node continues to live on as part of the NewNode fungus that
      // grows on top of the useful nodes.  Nothing more needs to be done
      // with it - move on to the next node.
      continue;
    }

    if (i == NumOperands) {
      DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG); dbgs() << "\n");
    }
    }
NodeDone:

    // If we reach here, the node was processed, potentially creating new
    // nodes.  Mark it as processed and add its users to the worklist as
    // appropriate.
    assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
    N->setNodeId(Processed);

    for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
         UI != E; ++UI) {
      SDNode *User = *UI;
      int NodeId = User->getNodeId();

      // This node has two options: it can either be a new node or its Node
      // ID may be a count of the number of operands it has that are not
      // ready.
      if (NodeId > 0) {
        User->setNodeId(NodeId-1);

        // If this was the last use it was waiting on, add it to the ready
        // list.
        if (NodeId-1 == ReadyToProcess)
          Worklist.push_back(User);
        continue;
      }

      // If this is an unreachable new node, then ignore it.  If it ever
      // becomes reachable by being used by a newly created node then it will
      // be handled by AnalyzeNewNode.
      if (NodeId == NewNode)
        continue;

      // Otherwise, this node is new: this is the first operand of it that
      // became ready.  Its new NodeId is the number of operands it has minus
      // 1 (as this node is now processed).
      assert(NodeId == Unanalyzed && "Unknown node ID!");
      User->setNodeId(User->getNumOperands() - 1);

      // If the node only has a single operand, it is now ready.
      if (User->getNumOperands() == 1)
        Worklist.push_back(User);
    }
  }
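//===----------------------------------------------------------------------===//
// Aside: the NodeId bookkeeping above is a count-down topological worklist.
// Each node's id holds the number of operands not yet processed; when an
// operand finishes, every user's count drops, and a node becomes ready at
// zero. The toy below models just that mechanism with hypothetical types
// ('ToyNode', 'process'), ignoring the NewNode/Unanalyzed states.
#include <iostream>
#include <vector>

struct ToyNode {
  int NodeId;                   // remaining unprocessed operands
  std::vector<ToyNode*> Users;
};

void process(std::vector<ToyNode*> Worklist) {
  while (!Worklist.empty()) {
    ToyNode *N = Worklist.back();
    Worklist.pop_back();
    std::cout << "processed node with " << N->Users.size() << " users\n";
    for (ToyNode *User : N->Users)
      if (--User->NodeId == 0)  // last operand this user was waiting on
        Worklist.push_back(User);
  }
}

int main() {
  ToyNode C{2, {}};
  ToyNode A{0, {&C}}, B{0, {&C}}; // two ready leaves, both feeding C
  process({&A, &B});              // C becomes ready only after both finish
}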
void ScheduleDAGSDNodes::AddSchedEdges() {
  const TargetSubtargetInfo &ST = MF.getSubtarget();

  // Check to see if the scheduler cares about latencies.
  bool UnitLatencies = forceUnitLatencies();

  // Pass 2: add the preds, succs, etc.
  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
    SUnit *SU = &SUnits[su];
    SDNode *MainNode = SU->getNode();

    if (MainNode->isMachineOpcode()) {
      unsigned Opc = MainNode->getMachineOpcode();
      const MCInstrDesc &MCID = TII->get(Opc);
      for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
        if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
          SU->isTwoAddress = true;
          break;
        }
      }
      if (MCID.isCommutable())
        SU->isCommutable = true;
    }

    // Find all predecessors and successors of the group.
    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
      if (N->isMachineOpcode() &&
          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
        SU->hasPhysRegClobbers = true;
        unsigned NumUsed = InstrEmitter::CountResults(N);
        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
          --NumUsed;    // Skip over unused values at the end.
        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
          SU->hasPhysRegDefs = true;
      }

      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
        SDNode *OpN = N->getOperand(i).getNode();
        if (isPassiveNode(OpN)) continue;   // Not scheduled.
        SUnit *OpSU = &SUnits[OpN->getNodeId()];
        assert(OpSU && "Node has no SUnit!");
        if (OpSU == SU) continue;           // In the same group.

        EVT OpVT = N->getOperand(i).getValueType();
        assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
        bool isChain = OpVT == MVT::Other;

        unsigned PhysReg = 0;
        int Cost = 1;
        // Determine if this is a physical register dependency.
        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
        assert((PhysReg == 0 || !isChain) &&
               "Chain dependence via physreg data?");
        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
        // emits a copy from the physical register to a virtual register
        // unless it requires a cross class copy (cost < 0). That means we are
        // only treating "expensive to copy" register dependency as physical
        // register dependency. This may change in the future though.
        if (Cost >= 0 && !StressSched)
          PhysReg = 0;

        // If this is a ctrl dep, latency is 1.
        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
        // Special-case TokenFactor chains as zero-latency.
        if (isChain && OpN->getOpcode() == ISD::TokenFactor)
          OpLatency = 0;

        SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
                           : SDep(OpSU, SDep::Data, PhysReg);
        Dep.setLatency(OpLatency);
        if (!isChain && !UnitLatencies) {
          computeOperandLatency(OpN, N, i, Dep);
          ST.adjustSchedDependency(OpSU, SU, Dep);
        }

        if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
          // Multiple register uses are combined in the same SUnit. For
          // example, we could have a set of glued nodes with all their defs
          // consumed by another set of glued nodes. Register pressure
          // tracking sees this as a single use, so to keep pressure balanced
          // we reduce the defs.
          //
          // We can't tell (without more book-keeping) if this results from
          // glued nodes or duplicate operands. As long as we don't reduce
          // NumRegDefsLeft to zero, we handle the common cases well.
          --OpSU->NumRegDefsLeft;
        }
      }
    }
  }
}
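//===----------------------------------------------------------------------===//
// Aside: the latency rule applied above, isolated into a toy function. Data
// edges inherit the producing unit's latency; ordinary chain (control) edges
// cost one cycle; chains coming out of a TokenFactor are treated as free.
// 'EdgeKind' and 'edgeLatency' are hypothetical names for this sketch.
enum class EdgeKind { Data, Chain };

unsigned edgeLatency(EdgeKind Kind, bool ProducerIsTokenFactor,
                     unsigned ProducerLatency) {
  if (Kind == EdgeKind::Data)
    return ProducerLatency;
  return ProducerIsTokenFactor ? 0 : 1;
}

int main() {
  // A chain edge out of a TokenFactor contributes no height to the schedule.
  return edgeLatency(EdgeKind::Chain, /*ProducerIsTokenFactor=*/true, 3);
}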
void ScheduleDAGSDNodes::BuildSchedUnits() {
  // During scheduling, the NodeId field of SDNode is used to map SDNodes
  // to their associated SUnits by holding SUnits table indices. A value
  // of -1 means the SDNode does not yet have an associated SUnit.
  unsigned NumNodes = 0;
  for (SDNode &NI : DAG->allnodes()) {
    NI.setNodeId(-1);
    ++NumNodes;
  }

  // Reserve entries in the vector for each of the SUnits we are creating.
  // This ensures that reallocation of the vector won't happen, so SUnit*'s
  // won't get invalidated.
  // FIXME: Multiply by 2 because we may clone nodes during scheduling.
  // This is a temporary workaround.
  SUnits.reserve(NumNodes * 2);

  // Add all nodes in depth first order.
  SmallVector<SDNode*, 64> Worklist;
  SmallPtrSet<SDNode*, 32> Visited;
  Worklist.push_back(DAG->getRoot().getNode());
  Visited.insert(DAG->getRoot().getNode());

  SmallVector<SUnit*, 8> CallSUnits;
  while (!Worklist.empty()) {
    SDNode *NI = Worklist.pop_back_val();

    // Add all operands to the worklist unless they've already been added.
    for (const SDValue &Op : NI->op_values())
      if (Visited.insert(Op.getNode()).second)
        Worklist.push_back(Op.getNode());

    if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
      continue;

    // If this node has already been processed, stop now.
    if (NI->getNodeId() != -1)
      continue;

    SUnit *NodeSUnit = newSUnit(NI);

    // See if anything is glued to this node, if so, add them to glued
    // nodes.  Nodes can have at most one glue input and one glue output.
    // Glue is required to be the last operand and result of a node.

    // Scan up to find glued preds.
    SDNode *N = NI;
    while (N->getNumOperands() &&
           N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
      N = N->getOperand(N->getNumOperands()-1).getNode();
      assert(N->getNodeId() == -1 && "Node already inserted!");
      N->setNodeId(NodeSUnit->NodeNum);
      if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
        NodeSUnit->isCall = true;
    }

    // Scan down to find any glued succs.
    N = NI;
    while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
      SDValue GlueVal(N, N->getNumValues()-1);

      // There are either zero or one users of the Glue result.
      bool HasGlueUse = false;
      for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
           UI != E; ++UI)
        if (GlueVal.isOperandOf(*UI)) {
          HasGlueUse = true;
          assert(N->getNodeId() == -1 && "Node already inserted!");
          N->setNodeId(NodeSUnit->NodeNum);
          N = *UI;
          if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
            NodeSUnit->isCall = true;
          break;
        }
      if (!HasGlueUse) break;
    }

    if (NodeSUnit->isCall)
      CallSUnits.push_back(NodeSUnit);

    // Schedule zero-latency TokenFactor below any nodes that may increase the
    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
    // have false stalls.
    if (NI->getOpcode() == ISD::TokenFactor)
      NodeSUnit->isScheduleLow = true;

    // If there are glue operands involved, N is now the bottom-most node
    // of the sequence of nodes that are glued together.
    // Update the SUnit.
    NodeSUnit->setNode(N);
    assert(N->getNodeId() == -1 && "Node already inserted!");
    N->setNodeId(NodeSUnit->NodeNum);

    // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
    InitNumRegDefsLeft(NodeSUnit);

    // Assign the Latency field of NodeSUnit using target-provided information.
    computeLatency(NodeSUnit);
  }

  // Find all call operands.
  while (!CallSUnits.empty()) {
    SUnit *SU = CallSUnits.pop_back_val();
    for (const SDNode *SUNode = SU->getNode(); SUNode;
         SUNode = SUNode->getGluedNode()) {
      if (SUNode->getOpcode() != ISD::CopyToReg)
        continue;
      SDNode *SrcN = SUNode->getOperand(2).getNode();
      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
      SrcSU->isCallOp = true;
    }
  }
}
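//===----------------------------------------------------------------------===//
// Aside: a sketch of the CallSUnits pass above with toy types. For every
// call, walk its glued nodes; whenever one is a copy into an argument
// register, flag the unit producing the copied value as a call operand.
// 'ToySUnit', 'ToyNode', and 'markCallOperands' are hypothetical stand-ins.
#include <vector>

struct ToySUnit {
  bool isCallOp;
};
struct ToyNode {
  bool IsCopyToReg;
  ToySUnit *SrcSU;    // unit producing the copied value, if scheduled
  ToyNode *GluedNode; // next glued node, or null
};

void markCallOperands(const std::vector<ToyNode*> &CallNodes) {
  for (ToyNode *Call : CallNodes)
    for (ToyNode *N = Call; N; N = N->GluedNode)
      if (N->IsCopyToReg && N->SrcSU)
        N->SrcSU->isCallOp = true;
}

int main() {
  ToySUnit ArgProducer{false};
  ToyNode Copy{true, &ArgProducer, nullptr};
  ToyNode Call{false, nullptr, &Copy};
  markCallOperands({&Call});
  return ArgProducer.isCallOp ? 0 : 1;
}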
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case ISD::BUILD_VECTOR: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
      break;
    }
    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF plus four
    // INSERT_SUBREG nodes, which produces a 128-bit register copy when it
    // goes through the TwoAddressInstructions pass.  We want to avoid 128-bit
    // copies as much as possible because they can't be bundled by our
    // scheduler.
    SDValue RegSeqArgs[9] = {
      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
    };
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[2 * i + 1] = N->getOperand(i);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
                                RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::ConstantFP:
  case ISD::Constant: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    // XXX: Custom immediate lowering not implemented yet.  Instead we use
    // pseudo instructions defined in SIInstructions.td
    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
      break;
    }
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());

    uint64_t ImmValue = 0;
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;

    if (N->getOpcode() == ISD::ConstantFP) {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::f64);

      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
      APFloat Value = C->getValueAPF();
      float FloatValue = Value.convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = Value.bitcastToAPInt().getZExtValue();
      }
    } else {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::i64);

      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (C->getZExtValue() == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (C->getZExtValue() == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = C->getZExtValue();
      }
    }

    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
         Use != SDNode::use_end(); Use = Next) {
      Next = llvm::next(Use);
      std::vector<SDValue> Ops;
      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
        Ops.push_back(Use->getOperand(i));
      }

      if (!Use->isMachineOpcode()) {
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          // We can only use literal constants (e.g. AMDGPU::ZERO,
          // AMDGPU::ONE, etc) in machine opcodes.
          continue;
        }
      } else {
        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
            (TII->get(Use->getMachineOpcode()).TSFlags &
             R600_InstFlag::VECTOR)) {
          continue;
        }

        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                        R600Operands::IMM);
        assert(ImmIdx != -1);

        // Subtract one from ImmIdx, because the DST operand is usually index
        // 0 for MachineInstrs, but we have no DST in the Ops vector.
        ImmIdx--;

        // Check that we aren't already using an immediate.
        // XXX: It's possible for an instruction to have more than one
        // immediate operand, but this is not supported yet.
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          ConstantSDNode *C =
              dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
          assert(C);
          if (C->getZExtValue() != 0) {
            // This instruction is already using an immediate.
            continue;
          }

          // Set the immediate value
          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
        }
      }
      // Set the immediate register
      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);

      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
    }
    break;
  }
  }

  SDNode *Result = SelectCode(N);

  // Fold operands of selected node
  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
    if (Result && Result->isMachineOpcode() &&
        !(TII->get(Result->getMachineOpcode()).TSFlags &
          R600_InstFlag::VECTOR) &&
        TII->isALUInstr(Result->getMachineOpcode())) {
      // Fold FNEG/FABS/CONST_ADDRESS
      // TODO: Isel can generate multiple MachineInstrs; we need to
      // recursively parse Result
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

      // If the node has a single use which is CLAMP_R600, fold it
      if (Result->hasOneUse() && Result->isMachineOpcode()) {
        SDNode *PotentialClamp = *Result->use_begin();
        if (PotentialClamp->isMachineOpcode() &&
            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
          unsigned ClampIdx =
              TII->getOperandIdx(Result->getMachineOpcode(),
                                 R600Operands::CLAMP);
          std::vector<SDValue> Ops;
          unsigned NumOp = Result->getNumOperands();
          for (unsigned i = 0; i < NumOp; ++i) {
            Ops.push_back(Result->getOperand(i));
          }
          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
          Result = CurDAG->SelectNodeTo(PotentialClamp,
                                        Result->getMachineOpcode(),
                                        PotentialClamp->getVTList(),
                                        Ops.data(), NumOp);
        }
      }
    }
  }
  return Result;
}
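//===----------------------------------------------------------------------===//
// Aside: the immediate classification used by both R600 Select versions,
// reduced to a toy. Frequently used constants map to dedicated inline
// registers; anything else must go through the literal slot. The string
// return values merely name the real AMDGPU::ZERO / ONE_INT / ALU_LITERAL_X
// registers; 'pickImmReg' itself is a hypothetical helper for this sketch.
#include <cstdint>
#include <string>

std::string pickImmReg(uint64_t Value) {
  if (Value == 0) return "ZERO";
  if (Value == 1) return "ONE_INT";
  return "ALU_LITERAL_X"; // value is emitted as a literal operand instead
}

int main() {
  return pickImmReg(0) == "ZERO" && pickImmReg(7) == "ALU_LITERAL_X" ? 0 : 1;
}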
SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
    SelectionDAGBuilder::StatepointLoweringInfo &SI) {
  // The basic scheme here is that information about both the original call
  // and the safepoint is encoded in the CallInst.  We create a temporary call
  // and lower it, then reverse engineer the calling sequence.

  NumOfStatepoints++;
  // Clear state
  StatepointLowering.startNewStatepoint(*this);

#ifndef NDEBUG
  // We schedule gc relocates before removeDuplicateGCPtrs since we _will_
  // encounter the duplicate gc relocates we elide in removeDuplicateGCPtrs.
  for (auto *Reloc : SI.GCRelocates)
    if (Reloc->getParent() == SI.StatepointInstr->getParent())
      StatepointLowering.scheduleRelocCall(*Reloc);
#endif

  // Remove any redundant llvm::Values which map to the same SDValue as
  // another input.  Also has the effect of removing duplicates in the
  // original llvm::Value input list as well.  This is a useful optimization
  // for reducing the size of the StackMap section.  It has no other impact.
  removeDuplicateGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this,
                        FuncInfo.StatepointSpillMaps[SI.StatepointInstr]);
  assert(SI.Bases.size() == SI.Ptrs.size() &&
         SI.Ptrs.size() == SI.GCRelocates.size());

  // Lower statepoint vmstate and gcstate arguments
  SmallVector<SDValue, 10> LoweredMetaArgs;
  lowerStatepointMetaArgs(LoweredMetaArgs, SI, *this);

  // Now that we've emitted the spills, we need to update the root so that the
  // call sequence is ordered correctly.
  SI.CLI.setChain(getRoot());

  // Get call node, we will replace it later with statepoint
  SDValue ReturnVal;
  SDNode *CallNode;
  std::tie(ReturnVal, CallNode) =
      lowerCallFromStatepointLoweringInfo(SI, *this, PendingExports);

  // Construct the actual GC_TRANSITION_START, STATEPOINT, and
  // GC_TRANSITION_END nodes with all the appropriate arguments and return
  // values.

  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
  SDValue Chain = CallNode->getOperand(0);

  SDValue Glue;
  bool CallHasIncomingGlue = CallNode->getGluedNode();
  if (CallHasIncomingGlue) {
    // Glue is always last operand
    Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
  }

  // Build the GC_TRANSITION_START node if necessary.
  //
  // The operands to the GC_TRANSITION_{START,END} nodes are laid out in the
  // order in which they appear in the call to the statepoint intrinsic. If
  // any of the operands is pointer-typed, that operand is immediately
  // followed by a SRCVALUE for the pointer that may be used during lowering
  // (e.g. to form MachinePointerInfo values for loads/stores).
  const bool IsGCTransition =
      (SI.StatepointFlags & (uint64_t)StatepointFlags::GCTransition) ==
      (uint64_t)StatepointFlags::GCTransition;
  if (IsGCTransition) {
    SmallVector<SDValue, 8> TSOps;

    // Add chain
    TSOps.push_back(Chain);

    // Add GC transition arguments
    for (const Value *V : SI.GCTransitionArgs) {
      TSOps.push_back(getValue(V));
      if (V->getType()->isPointerTy())
        TSOps.push_back(DAG.getSrcValue(V));
    }

    // Add glue if necessary
    if (CallHasIncomingGlue)
      TSOps.push_back(Glue);

    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue GCTransitionStart =
        DAG.getNode(ISD::GC_TRANSITION_START, getCurSDLoc(), NodeTys, TSOps);

    Chain = GCTransitionStart.getValue(0);
    Glue = GCTransitionStart.getValue(1);
  }

  // TODO: Currently, all of these operands are being marked as read/write in
  // PrologEpilogInserter.cpp; we should special-case the VMState arguments
  // and flags to be read-only.
  SmallVector<SDValue, 40> Ops;

  // Add the <id> and <numBytes> constants.
  Ops.push_back(DAG.getTargetConstant(SI.ID, getCurSDLoc(), MVT::i64));
  Ops.push_back(
      DAG.getTargetConstant(SI.NumPatchBytes, getCurSDLoc(), MVT::i32));

  // Calculate and push starting position of vmstate arguments
  // Get number of arguments incoming directly into call node
  unsigned NumCallRegArgs =
      CallNode->getNumOperands() - (CallHasIncomingGlue ? 4 : 3);
  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, getCurSDLoc(), MVT::i32));

  // Add call target
  SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
  Ops.push_back(CallTarget);

  // Add call arguments
  // Get position of register mask in the call
  SDNode::op_iterator RegMaskIt;
  if (CallHasIncomingGlue)
    RegMaskIt = CallNode->op_end() - 2;
  else
    RegMaskIt = CallNode->op_end() - 1;
  Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);

  // Add a constant argument for the calling convention
  pushStackMapConstant(Ops, *this, SI.CLI.CallConv);

  // Add a constant argument for the flags
  uint64_t Flags = SI.StatepointFlags;
  assert(((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0) &&
         "Unknown flag used");
  pushStackMapConstant(Ops, *this, Flags);

  // Insert all vmstate and gcstate arguments
  Ops.insert(Ops.end(), LoweredMetaArgs.begin(), LoweredMetaArgs.end());

  // Add register mask from call node
  Ops.push_back(*RegMaskIt);

  // Add chain
  Ops.push_back(Chain);

  // Same for the glue, but we add it only if original call had it
  if (Glue.getNode())
    Ops.push_back(Glue);

  // Compute return values.  Provide a glue output since we consume one as
  // input.  This allows someone else to chain off us as needed.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  SDNode *StatepointMCNode =
      DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);

  SDNode *SinkNode = StatepointMCNode;

  // Build the GC_TRANSITION_END node if necessary.
  //
  // See the comment above regarding GC_TRANSITION_START for the layout of
  // the operands to the GC_TRANSITION_END node.
  if (IsGCTransition) {
    SmallVector<SDValue, 8> TEOps;

    // Add chain
    TEOps.push_back(SDValue(StatepointMCNode, 0));

    // Add GC transition arguments
    for (const Value *V : SI.GCTransitionArgs) {
      TEOps.push_back(getValue(V));
      if (V->getType()->isPointerTy())
        TEOps.push_back(DAG.getSrcValue(V));
    }

    // Add glue
    TEOps.push_back(SDValue(StatepointMCNode, 1));

    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

    SDValue GCTransitionStart =
        DAG.getNode(ISD::GC_TRANSITION_END, getCurSDLoc(), NodeTys, TEOps);

    SinkNode = GCTransitionStart.getNode();
  }

  // Replace original call
  DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
  // Remove original call node
  DAG.DeleteNode(CallNode);

  // DON'T set the root - under the assumption that it's already set past the
  // inserted node we created.

  // TODO: A better future implementation would be to emit a single variable
  // argument, variable return value STATEPOINT node here and then hookup the
  // return value of each gc.relocate to the respective output of the
  // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
  // to actually be possible today.

  return ReturnVal;
}
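//===----------------------------------------------------------------------===//
// Aside: both statepoint lowerings end with the same replace-and-delete
// pattern (ReplaceAllUsesWith followed by DeleteNode). The toy below models
// that graph rewrite: redirect every operand edge pointing at the old node to
// the new one, after which the old node is unreachable and can be destroyed.
// 'GNode' and 'replaceAllUsesWith' here are hypothetical stand-ins.
#include <memory>
#include <vector>

struct GNode {
  std::vector<GNode*> Operands;
};

void replaceAllUsesWith(std::vector<std::unique_ptr<GNode>> &Graph,
                        GNode *Old, GNode *New) {
  for (auto &N : Graph)
    for (GNode *&Op : N->Operands)
      if (Op == Old)
        Op = New; // redirect the use edge
}

int main() {
  std::vector<std::unique_ptr<GNode>> Graph;
  Graph.push_back(std::make_unique<GNode>()); // old node
  Graph.push_back(std::make_unique<GNode>()); // replacement node
  Graph.push_back(std::make_unique<GNode>()); // user of the old node
  Graph[2]->Operands.push_back(Graph[0].get());
  replaceAllUsesWith(Graph, Graph[0].get(), Graph[1].get());
  return Graph[2]->Operands[0] == Graph[1].get() ? 0 : 1;
}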
/// Return true if this node is so simple that we should just print it inline
/// if it appears as an operand.
static bool shouldPrintInline(const SDNode &Node) {
  if (Node.getOpcode() == ISD::EntryToken)
    return false;
  return Node.getNumOperands() == 0;
}