void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
  SDNode *N = SU->getNode();

  // TokenFactor operands are considered zero latency, and some schedulers
  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
  // whenever node latency is nonzero.
  if (N && N->getOpcode() == ISD::TokenFactor) {
    SU->Latency = 0;
    return;
  }

  // Check to see if the scheduler cares about latencies.
  if (forceUnitLatencies()) {
    SU->Latency = 1;
    return;
  }

  if (!InstrItins || InstrItins->isEmpty()) {
    if (N && N->isMachineOpcode() &&
        TII->isHighLatencyDef(N->getMachineOpcode()))
      SU->Latency = HighLatencyCycles;
    else
      SU->Latency = 1;
    return;
  }

  // Compute the latency for the node.  We use the sum of the latencies for
  // all nodes glued together into this SUnit.
  SU->Latency = 0;
  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
    if (N->isMachineOpcode())
      SU->Latency += TII->getInstrLatency(InstrItins, N);
}
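// The glue-chain walk at the end of computeLatency is a recurring idiom in
// these snippets: one SUnit may cover several SDNodes fused by glue, so
// per-SUnit metrics are sums over the whole group. A minimal sketch of the
// same traversal with a hypothetical per-node cost callback (sumOverGlueGroup
// and nodeCost are illustrations, not LLVM API; TII->getInstrLatency plays
// the nodeCost role in the real code above):
template <typename CostFn>
static unsigned sumOverGlueGroup(const SUnit *SU, CostFn nodeCost) {
  unsigned Total = 0;
  for (const SDNode *N = SU->getNode(); N; N = N->getGluedNode())
    if (N->isMachineOpcode())   // Only real machine instrs contribute.
      Total += nodeCost(N);
  return Total;
}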
void ResourcePriorityQueue::initNumRegDefsLeft(SUnit *SU) {
  unsigned NodeNumDefs = 0;
  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
    if (N->isMachineOpcode()) {
      const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
      // No register need be allocated for this.
      if (N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
        NodeNumDefs = 0;
        break;
      }
      NodeNumDefs = std::min(N->getNumValues(), TID.getNumDefs());
    }
    else
      switch (N->getOpcode()) {
      default: break;
      case ISD::CopyFromReg:
        NodeNumDefs++;
        break;
      case ISD::INLINEASM:
        NodeNumDefs++;
        break;
      }

  SU->NumRegDefsLeft = NodeNumDefs;
}
void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();

  // Compute the latency for the node.  We use the sum of the latencies for
  // all nodes flagged together into this SUnit.
  SU->Latency = 0;
  for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
    if (N->isMachineOpcode()) {
      SU->Latency += InstrItins.
        getStageLatency(TII->get(N->getMachineOpcode()).getSchedClass());
    }
}
/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
/// scheduling of the given node to satisfy live physical register
/// dependencies. If the specific node is the last one that's available to
/// schedule, do whatever is necessary (i.e. backtracking or cloning) to make
/// it possible.
bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
                                               SmallVectorImpl<unsigned> &LRegs) {
  if (NumLiveRegs == 0)
    return false;

  SmallSet<unsigned, 4> RegAdded;
  // If this node would clobber any "live" register, then it's not ready.
  for (SDep &Pred : SU->Preds) {
    if (Pred.isAssignedRegDep()) {
      CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs,
                         RegAdded, LRegs, TRI);
    }
  }

  for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
    if (Node->getOpcode() == ISD::INLINEASM) {
      // Inline asm can clobber physical defs.
      unsigned NumOps = Node->getNumOperands();
      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
        --NumOps;  // Ignore the glue operand.

      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
        unsigned Flags =
          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);

        ++i; // Skip the ID value.
        if (InlineAsm::isRegDefKind(Flags) ||
            InlineAsm::isRegDefEarlyClobberKind(Flags) ||
            InlineAsm::isClobberKind(Flags)) {
          // Check for def of register or earlyclobber register.
          for (; NumVals; --NumVals, ++i) {
            unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
            if (TargetRegisterInfo::isPhysicalRegister(Reg))
              CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
          }
        } else
          i += NumVals;
      }
      continue;
    }

    if (!Node->isMachineOpcode())
      continue;
    const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
    if (!MCID.ImplicitDefs)
      continue;
    for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) {
      CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
    }
  }

  return !LRegs.empty();
}
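// The INLINEASM handling above follows the standard flag-word encoding:
// after the fixed leading operands, each operand group starts with a
// ConstantSDNode of flags, followed by getNumOperandRegisters(Flags)
// register values. A hedged, standalone sketch of the same decode loop
// (countInlineAsmRegDefs is a hypothetical helper, not LLVM API; it assumes
// the same InlineAsm flag accessors used in the function above):
static unsigned countInlineAsmRegDefs(const SDNode *Node) {
  unsigned NumOps = Node->getNumOperands();
  if (Node->getOperand(NumOps - 1).getValueType() == MVT::Glue)
    --NumOps; // Ignore the trailing glue operand.

  unsigned Defs = 0;
  for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
    unsigned Flags =
      cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
    unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
    if (InlineAsm::isRegDefKind(Flags) ||
        InlineAsm::isRegDefEarlyClobberKind(Flags))
      Defs += NumVals;
    i += NumVals + 1; // Skip the flag word plus its register operands.
  }
  return Defs;
}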
/// ClusterNodes - Cluster certain nodes which should be scheduled together.
///
void ScheduleDAGSDNodes::ClusterNodes() {
  for (SDNode &NI : DAG->allnodes()) {
    SDNode *Node = &NI;
    if (!Node || !Node->isMachineOpcode())
      continue;

    unsigned Opc = Node->getMachineOpcode();
    const MCInstrDesc &MCID = TII->get(Opc);
    if (MCID.mayLoad())
      // Cluster loads from "near" addresses into combined SUnits.
      ClusterNeighboringLoads(Node);
  }
}
SDNode *VDAGToDAGISel::SelectBitSlice(SDNode *N) {
  SDNode *Op = N->getOperand(0).getNode();

  // Emit the constant bit slice to constant directly if possible.
  if (ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Op))
    return SelectConstBitSlice(CSD, N);

  assert(Op->getOpcode() != VTMISD::BitSlice &&
         (!Op->isMachineOpcode() ||
          Op->getMachineOpcode() != VTM::VOpBitSlice) &&
         "DAGCombine should handle this!");

  return SelectSimpleNode(N, VTM::VOpBitSlice);
}
/// ClusterNodes - Cluster certain nodes which should be scheduled together.
///
void ScheduleDAGSDNodes::ClusterNodes() {
  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
       E = DAG->allnodes_end(); NI != E; ++NI) {
    SDNode *Node = &*NI;
    if (!Node || !Node->isMachineOpcode())
      continue;

    unsigned Opc = Node->getMachineOpcode();
    const MCInstrDesc &MCID = TII->get(Opc);
    if (MCID.mayLoad())
      // Cluster loads from "near" addresses into combined SUnits.
      ClusterNeighboringLoads(Node);
  }
}
SDNode *SparcDAGToDAGISel::Select(SDValue Op) {
  SDNode *N = Op.getNode();
  DebugLoc dl = N->getDebugLoc();
  if (N->isMachineOpcode())
    return NULL;   // Already selected.

  switch (N->getOpcode()) {
  default: break;
  case SPISD::GLOBAL_BASE_REG:
    return getGlobalBaseReg();

  case ISD::SDIV:
  case ISD::UDIV: {
    // FIXME: should use a custom expander to expose the SRA to the dag.
    SDValue DivLHS = N->getOperand(0);
    SDValue DivRHS = N->getOperand(1);

    // Set the Y register to the high-part.
    SDValue TopPart;
    if (N->getOpcode() == ISD::SDIV) {
      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
                                   CurDAG->getTargetConstant(31, MVT::i32)),
                        0);
    } else {
      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
    }
    TopPart = SDValue(CurDAG->getMachineNode(SP::WRYrr, dl, MVT::Flag, TopPart,
                                 CurDAG->getRegister(SP::G0, MVT::i32)), 0);

    // FIXME: Handle div by immediate.
    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
    return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
                                TopPart);
  }
  case ISD::MULHU:
  case ISD::MULHS: {
    // FIXME: Handle mul by immediate.
    SDValue MulLHS = N->getOperand(0);
    SDValue MulRHS = N->getOperand(1);
    unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
    SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
                                         MulLHS, MulRHS);
    // The high part is in the Y register.
    return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
  }
  }

  return SelectCode(Op);
}
void ScheduleDAGLinearize::Schedule() {
  LLVM_DEBUG(dbgs() << "********** DAG Linearization **********\n");

  SmallVector<SDNode*, 8> Glues;
  unsigned DAGSize = 0;
  for (SDNode &Node : DAG->allnodes()) {
    SDNode *N = &Node;

    // Use node id to record degree.
    unsigned Degree = N->use_size();
    N->setNodeId(Degree);
    unsigned NumVals = N->getNumValues();
    if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
        N->hasAnyUseOfValue(NumVals-1)) {
      SDNode *User = findGluedUser(N);
      if (User) {
        Glues.push_back(N);
        GluedMap.insert(std::make_pair(N, User));
      }
    }

    if (N->isMachineOpcode() ||
        (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
      ++DAGSize;
  }

  for (unsigned i = 0, e = Glues.size(); i != e; ++i) {
    SDNode *Glue = Glues[i];
    SDNode *GUser = GluedMap[Glue];
    unsigned Degree = Glue->getNodeId();
    unsigned UDegree = GUser->getNodeId();

    // Glue user must be scheduled together with the glue operand. So other
    // users of the glue operand must be treated as its users.
    SDNode *ImmGUser = Glue->getGluedUser();
    for (const SDNode *U : Glue->uses())
      if (U == ImmGUser)
        --Degree;
    GUser->setNodeId(UDegree + Degree);
    Glue->setNodeId(1);
  }

  Sequence.reserve(DAGSize);
  ScheduleNode(DAG->getRoot().getNode());
}
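// Schedule() above seeds each node's NodeId with its use count so that
// ScheduleNode (not shown in this snippet) can decrement the counter as
// users are emitted and recurse once it reaches zero. A self-contained
// sketch of that counting pattern over a plain graph -- Node, Uses and
// scheduleNode are hypothetical names, not LLVM's:
#include <vector>

struct Node {
  int Uses = 0;                  // Remaining unemitted users (cf. NodeId).
  std::vector<Node *> Operands;  // Nodes this one depends on.
};

// Emit N, then recurse into operands whose last user was just emitted.
// Sequence ends up users-before-operands, matching the linearizer's
// reverse emission order.
static void scheduleNode(Node *N, std::vector<Node *> &Sequence) {
  Sequence.push_back(N);
  for (Node *Op : N->Operands)
    if (--Op->Uses == 0)
      scheduleNode(Op, Sequence);
}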
void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
  // Check to see if the scheduler cares about latencies.
  if (ForceUnitLatencies()) {
    SU->Latency = 1;
    return;
  }

  if (!InstrItins || InstrItins->isEmpty()) {
    SU->Latency = 1;
    return;
  }

  // Compute the latency for the node.  We use the sum of the latencies for
  // all nodes glued together into this SUnit.
  SU->Latency = 0;
  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
    if (N->isMachineOpcode())
      SU->Latency += TII->getInstrLatency(InstrItins, N);
}
void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
  // Check to see if the scheduler cares about latencies.
  if (ForceUnitLatencies()) {
    SU->Latency = 1;
    return;
  }

  const InstrItineraryData &InstrItins = TM.getInstrItineraryData();
  if (InstrItins.isEmpty()) {
    SU->Latency = 1;
    return;
  }

  // Compute the latency for the node.  We use the sum of the latencies for
  // all nodes flagged together into this SUnit.
  SU->Latency = 0;
  for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
    if (N->isMachineOpcode()) {
      SU->Latency += InstrItins.
        getStageLatency(TII->get(N->getMachineOpcode()).getSchedClass());
    }
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(TM.getInstrInfo());
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case AMDGPUISD::CONST_ADDRESS: {
    for (SDNode::use_iterator I = N->use_begin(), Next = llvm::next(I);
         I != SDNode::use_end(); I = Next) {
      Next = llvm::next(I);
      if (!I->isMachineOpcode()) {
        continue;
      }
      unsigned Opcode = I->getMachineOpcode();
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SrcIdx = I.getOperandNo();
      int SelIdx;
      // Unlike MachineInstrs, SDNodes do not have results in their operand
      // list, so we need to increment the SrcIdx, since
      // R600InstrInfo::getOperandIdx is based on the MachineInstr indices.
      if (HasDst) {
        SrcIdx++;
      }

      SelIdx = TII->getSelIdx(I->getMachineOpcode(), SrcIdx);
      if (SelIdx < 0) {
        continue;
      }

      SDValue CstOffset;
      if (N->getValueType(0).isVector() ||
          !SelectGlobalValueConstantOffset(N->getOperand(0), CstOffset))
        continue;

      // Gather constants values
      int SrcIndices[] = {
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
      };
      std::vector<unsigned> Consts;
      for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
        int OtherSrcIdx = SrcIndices[i];
        int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
        if (OtherSrcIdx < 0 || OtherSelIdx < 0) {
          continue;
        }
        if (HasDst) {
          OtherSrcIdx--;
          OtherSelIdx--;
        }
        if (RegisterSDNode *Reg =
            dyn_cast<RegisterSDNode>(I->getOperand(OtherSrcIdx))) {
          if (Reg->getReg() == AMDGPU::ALU_CONST) {
            ConstantSDNode *Cst =
                dyn_cast<ConstantSDNode>(I->getOperand(OtherSelIdx));
            Consts.push_back(Cst->getZExtValue());
          }
        }
      }

      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
      Consts.push_back(Cst->getZExtValue());
      if (!TII->fitsConstReadLimitations(Consts))
        continue;

      // Convert back to SDNode indices
      if (HasDst) {
        SrcIdx--;
        SelIdx--;
      }
      std::vector<SDValue> Ops;
      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
        if (i == SrcIdx) {
          Ops.push_back(CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32));
        } else if (i == SelIdx) {
          Ops.push_back(CstOffset);
        } else {
          Ops.push_back(I->getOperand(i));
        }
      }
      CurDAG->UpdateNodeOperands(*I, Ops.data(), Ops.size());
    }
    break;
  }
  case ISD::BUILD_VECTOR: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }

    unsigned RegClassID;
    switch (N->getValueType(0).getVectorNumElements()) {
    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
    }
    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // that adds a 128 bits reg copy when going through TwoAddressInstructions
    // pass. We want to avoid 128 bits copies as much as possible because they
    // can't be bundled by our scheduler.
    SDValue RegSeqArgs[9] = {
      CurDAG->getTargetConstant(RegClassID, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
    };
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[2 * i + 1] = N->getOperand(i);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
                                RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
                                  N->getValueType(0), Ops);
  }
  case ISD::ConstantFP:
  case ISD::Constant: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    // XXX: Custom immediate lowering not implemented yet.  Instead we use
    // pseudo instructions defined in SIInstructions.td
    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }

    uint64_t ImmValue = 0;
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;

    if (N->getOpcode() == ISD::ConstantFP) {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::f64);

      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
      APFloat Value = C->getValueAPF();
      float FloatValue = Value.convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = Value.bitcastToAPInt().getZExtValue();
      }
    } else {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::i64);

      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (C->getZExtValue() == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (C->getZExtValue() == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = C->getZExtValue();
      }
    }

    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
         Use != SDNode::use_end(); Use = Next) {
      Next = llvm::next(Use);
      std::vector<SDValue> Ops;
      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
        Ops.push_back(Use->getOperand(i));
      }

      if (!Use->isMachineOpcode()) {
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          // We can only use literal constants (e.g. AMDGPU::ZERO,
          // AMDGPU::ONE, etc) in machine opcodes.
          continue;
        }
      } else {
        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
            (TII->get(Use->getMachineOpcode()).TSFlags &
             R600_InstFlag::VECTOR)) {
          continue;
        }

        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                        AMDGPU::OpName::literal);
        if (ImmIdx == -1) {
          continue;
        }

        if (TII->getOperandIdx(Use->getMachineOpcode(),
                               AMDGPU::OpName::dst) != -1) {
          // subtract one from ImmIdx, because the DST operand is usually index
          // 0 for MachineInstrs, but we have no DST in the Ops vector.
          ImmIdx--;
        }

        // Check that we aren't already using an immediate.
        // XXX: It's possible for an instruction to have more than one
        // immediate operand, but this is not supported yet.
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          ConstantSDNode *C =
              dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
          assert(C);

          if (C->getZExtValue() != 0) {
            // This instruction is already using an immediate.
            continue;
          }

          // Set the immediate value
          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
        }
      }
      // Set the immediate register
      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);

      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
    }
    break;
  }
  }
  SDNode *Result = SelectCode(N);

  // Fold operands of selected node
  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
    if (Result && Result->isMachineOpcode() &&
        Result->getMachineOpcode() == AMDGPU::DOT_4) {
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);
    }
    if (Result && Result->isMachineOpcode() &&
        !(TII->get(Result->getMachineOpcode()).TSFlags &
          R600_InstFlag::VECTOR) &&
        TII->hasInstrModifiers(Result->getMachineOpcode())) {
      // Fold FNEG/FABS
      // TODO: Isel can generate multiple MachineInst, we need to recursively
      // parse Result
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

      // If node has a single use which is CLAMP_R600, folds it
      if (Result->hasOneUse() && Result->isMachineOpcode()) {
        SDNode *PotentialClamp = *Result->use_begin();
        if (PotentialClamp->isMachineOpcode() &&
            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
          unsigned ClampIdx =
              TII->getOperandIdx(Result->getMachineOpcode(),
                                 AMDGPU::OpName::clamp);
          std::vector<SDValue> Ops;
          unsigned NumOp = Result->getNumOperands();
          for (unsigned i = 0; i < NumOp; ++i) {
            Ops.push_back(Result->getOperand(i));
          }
          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
          Result = CurDAG->SelectNodeTo(PotentialClamp,
                                        Result->getMachineOpcode(),
                                        PotentialClamp->getVTList(),
                                        Ops.data(), NumOp);
        }
      }
    }
  }
  return Result;
}
void ScheduleDAGSDNodes::BuildSchedUnits() {
  // During scheduling, the NodeId field of SDNode is used to map SDNodes
  // to their associated SUnits by holding SUnits table indices. A value
  // of -1 means the SDNode does not yet have an associated SUnit.
  unsigned NumNodes = 0;
  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
       E = DAG->allnodes_end(); NI != E; ++NI) {
    NI->setNodeId(-1);
    ++NumNodes;
  }

  // Reserve entries in the vector for each of the SUnits we are creating. This
  // ensure that reallocation of the vector won't happen, so SUnit*'s won't get
  // invalidated.
  // FIXME: Multiply by 2 because we may clone nodes during scheduling.
  // This is a temporary workaround.
  SUnits.reserve(NumNodes * 2);

  // Add all nodes in depth first order.
  SmallVector<SDNode*, 64> Worklist;
  SmallPtrSet<SDNode*, 64> Visited;
  Worklist.push_back(DAG->getRoot().getNode());
  Visited.insert(DAG->getRoot().getNode());

  SmallVector<SUnit*, 8> CallSUnits;
  while (!Worklist.empty()) {
    SDNode *NI = Worklist.pop_back_val();

    // Add all operands to the worklist unless they've already been added.
    for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i)
      if (Visited.insert(NI->getOperand(i).getNode()))
        Worklist.push_back(NI->getOperand(i).getNode());

    if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
      continue;

    // If this node has already been processed, stop now.
    if (NI->getNodeId() != -1) continue;

    SUnit *NodeSUnit = newSUnit(NI);

    // See if anything is glued to this node, if so, add them to glued
    // nodes.  Nodes can have at most one glue input and one glue output.  Glue
    // is required to be the last operand and result of a node.

    // Scan up to find glued preds.
    SDNode *N = NI;
    while (N->getNumOperands() &&
           N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
      N = N->getOperand(N->getNumOperands()-1).getNode();
      assert(N->getNodeId() == -1 && "Node already inserted!");
      N->setNodeId(NodeSUnit->NodeNum);
      if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
        NodeSUnit->isCall = true;
    }

    // Scan down to find any glued succs.
    N = NI;
    while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
      SDValue GlueVal(N, N->getNumValues()-1);

      // There are either zero or one users of the Glue result.
      bool HasGlueUse = false;
      for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
           UI != E; ++UI)
        if (GlueVal.isOperandOf(*UI)) {
          HasGlueUse = true;
          assert(N->getNodeId() == -1 && "Node already inserted!");
          N->setNodeId(NodeSUnit->NodeNum);
          N = *UI;
          if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
            NodeSUnit->isCall = true;
          break;
        }
      if (!HasGlueUse) break;
    }

    if (NodeSUnit->isCall)
      CallSUnits.push_back(NodeSUnit);

    // Schedule zero-latency TokenFactor below any nodes that may increase the
    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
    // have false stalls.
    if (NI->getOpcode() == ISD::TokenFactor)
      NodeSUnit->isScheduleLow = true;

    // If there are glue operands involved, N is now the bottom-most node
    // of the sequence of nodes that are glued together.
    // Update the SUnit.
    NodeSUnit->setNode(N);
    assert(N->getNodeId() == -1 && "Node already inserted!");
    N->setNodeId(NodeSUnit->NodeNum);

    // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
    InitNumRegDefsLeft(NodeSUnit);

    // Assign the Latency field of NodeSUnit using target-provided information.
    computeLatency(NodeSUnit);
  }

  // Find all call operands.
  while (!CallSUnits.empty()) {
    SUnit *SU = CallSUnits.pop_back_val();
    for (const SDNode *SUNode = SU->getNode(); SUNode;
         SUNode = SUNode->getGluedNode()) {
      if (SUNode->getOpcode() != ISD::CopyToReg)
        continue;
      SDNode *SrcN = SUNode->getOperand(2).getNode();
      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
      SrcSU->isCallOp = true;
    }
  }
}
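// BuildSchedUnits leans on the invariant that glue, when present, is always
// the last operand and the last result of a node; that is what makes the
// "scan up" and "scan down" loops above correct. Two small predicates
// spelling out those checks (hypothetical helpers, not part of LLVM):
static bool consumesGlue(const SDNode *N) {
  unsigned NumOps = N->getNumOperands();
  return NumOps &&
         N->getOperand(NumOps - 1).getValueType() == MVT::Glue;
}

static bool producesGlue(const SDNode *N) {
  return N->getValueType(N->getNumValues() - 1) == MVT::Glue;
}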
void ScheduleDAGSDNodes::AddSchedEdges() {
  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();

  // Check to see if the scheduler cares about latencies.
  bool UnitLatencies = forceUnitLatencies();

  // Pass 2: add the preds, succs, etc.
  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
    SUnit *SU = &SUnits[su];
    SDNode *MainNode = SU->getNode();

    if (MainNode->isMachineOpcode()) {
      unsigned Opc = MainNode->getMachineOpcode();
      const MCInstrDesc &MCID = TII->get(Opc);
      for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
        if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
          SU->isTwoAddress = true;
          break;
        }
      }
      if (MCID.isCommutable())
        SU->isCommutable = true;
    }

    // Find all predecessors and successors of the group.
    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
      if (N->isMachineOpcode() &&
          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
        SU->hasPhysRegClobbers = true;
        unsigned NumUsed = InstrEmitter::CountResults(N);
        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
          --NumUsed;    // Skip over unused values at the end.
        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
          SU->hasPhysRegDefs = true;
      }

      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
        SDNode *OpN = N->getOperand(i).getNode();
        if (isPassiveNode(OpN)) continue;   // Not scheduled.
        SUnit *OpSU = &SUnits[OpN->getNodeId()];
        assert(OpSU && "Node has no SUnit!");
        if (OpSU == SU) continue;           // In the same group.

        EVT OpVT = N->getOperand(i).getValueType();
        assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
        bool isChain = OpVT == MVT::Other;

        unsigned PhysReg = 0;
        int Cost = 1;
        // Determine if this is a physical register dependency.
        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
        assert((PhysReg == 0 || !isChain) &&
               "Chain dependence via physreg data?");
        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
        // emits a copy from the physical register to a virtual register unless
        // it requires a cross class copy (cost < 0). That means we are only
        // treating "expensive to copy" register dependency as physical register
        // dependency. This may change in the future though.
        if (Cost >= 0 && !StressSched)
          PhysReg = 0;

        // If this is a ctrl dep, latency is 1.
        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
        // Special-case TokenFactor chains as zero-latency.
        if (isChain && OpN->getOpcode() == ISD::TokenFactor)
          OpLatency = 0;

        const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                               OpLatency, PhysReg);
        if (!isChain && !UnitLatencies) {
          computeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
          ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
        }

        if (!SU->addPred(dep) && !dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
          // Multiple register uses are combined in the same SUnit. For
          // example, we could have a set of glued nodes with all their defs
          // consumed by another set of glued nodes. Register pressure
          // tracking sees this as a single use, so to keep pressure balanced
          // we reduce the defs.
          //
          // We can't tell (without more book-keeping) if this results from
          // glued nodes or duplicate operands. As long as we don't reduce
          // NumRegDefsLeft to zero, we handle the common cases well.
          --OpSU->NumRegDefsLeft;
        }
      }
    }
  }
}
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
SDNode *IA64DAGToDAGISel::Select(SDValue Op) {
  SDNode *N = Op.getNode();
  if (N->isMachineOpcode())
    return NULL;   // Already selected.
  DebugLoc dl = Op.getDebugLoc();

  switch (N->getOpcode()) {
  default: break;

  case IA64ISD::BRCALL: { // XXX: this is also a hack!
    SDValue Chain = N->getOperand(0);
    SDValue InFlag;  // Null incoming flag value.

    if (N->getNumOperands() == 3) { // we have an incoming chain, callee and flag
      InFlag = N->getOperand(2);
    }

    unsigned CallOpcode;
    SDValue CallOperand;

    // if we can call directly, do so
    if (GlobalAddressSDNode *GASD =
        dyn_cast<GlobalAddressSDNode>(N->getOperand(1))) {
      CallOpcode = IA64::BRCALL_IPREL_GA;
      CallOperand = CurDAG->getTargetGlobalAddress(GASD->getGlobal(),
                                                   MVT::i64);
    } else if (isa<ExternalSymbolSDNode>(N->getOperand(1))) {
      // FIXME: we currently NEED this case for correctness, to avoid
      // "non-pic code with imm reloc.n against dynamic symbol" errors
      CallOpcode = IA64::BRCALL_IPREL_ES;
      CallOperand = N->getOperand(1);
    } else {
      // otherwise we need to load the function descriptor,
      // load the branch target (function)'s entry point and GP,
      // branch (call) then restore the GP
      SDValue FnDescriptor = N->getOperand(1);

      // load the branch target's entry point [mem] and
      // GP value [mem+8]
      SDValue targetEntryPoint =
        SDValue(CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, MVT::Other,
                                      FnDescriptor, CurDAG->getEntryNode()),
                0);
      Chain = targetEntryPoint.getValue(1);
      SDValue targetGPAddr =
        SDValue(CurDAG->getTargetNode(IA64::ADDS, dl, MVT::i64,
                                      FnDescriptor,
                                      CurDAG->getConstant(8, MVT::i64)), 0);
      Chain = targetGPAddr.getValue(1);
      SDValue targetGP =
        SDValue(CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, MVT::Other,
                                      targetGPAddr, CurDAG->getEntryNode()),
                0);
      Chain = targetGP.getValue(1);

      Chain = CurDAG->getCopyToReg(Chain, dl, IA64::r1, targetGP, InFlag);
      InFlag = Chain.getValue(1);
      Chain = CurDAG->getCopyToReg(Chain, dl, IA64::B6,
                                   targetEntryPoint, InFlag); // FLAG these?
      InFlag = Chain.getValue(1);

      CallOperand = CurDAG->getRegister(IA64::B6, MVT::i64);
      CallOpcode = IA64::BRCALL_INDIRECT;
    }

    // Finally, once everything is setup, emit the call itself
    if (InFlag.getNode())
      Chain = SDValue(CurDAG->getTargetNode(CallOpcode, dl, MVT::Other,
                                            MVT::Flag, CallOperand, InFlag),
                      0);
    else // there might be no arguments
      Chain = SDValue(CurDAG->getTargetNode(CallOpcode, dl, MVT::Other,
                                            MVT::Flag, CallOperand, Chain),
                      0);
    InFlag = Chain.getValue(1);

    std::vector<SDValue> CallResults;
    CallResults.push_back(Chain);
    CallResults.push_back(InFlag);

    for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
      ReplaceUses(Op.getValue(i), CallResults[i]);
    return NULL;
  }

  case IA64ISD::GETFD: {
    SDValue Input = N->getOperand(0);
    return CurDAG->getTargetNode(IA64::GETFD, dl, MVT::i64, Input);
  }

  case ISD::FDIV:
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
    return SelectDIV(Op);

  case ISD::TargetConstantFP: {
    SDValue Chain = CurDAG->getEntryNode(); // this is a constant, so..

    SDValue V;
    ConstantFPSDNode* N2 = cast<ConstantFPSDNode>(N);
    if (N2->getValueAPF().isPosZero()) {
      V = CurDAG->getCopyFromReg(Chain, dl, IA64::F0, MVT::f64);
    } else if (N2->isExactlyValue(N2->getValueType(0) == MVT::f32 ?
                                  APFloat(+1.0f) : APFloat(+1.0))) {
      V = CurDAG->getCopyFromReg(Chain, dl, IA64::F1, MVT::f64);
    } else
      assert(0 && "Unexpected FP constant!");

    ReplaceUses(SDValue(N, 0), V);
    return 0;
  }

  case ISD::FrameIndex: { // TODO: reduce creepyness
    int FI = cast<FrameIndexSDNode>(N)->getIndex();
    if (N->hasOneUse())
      return CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64,
                                  CurDAG->getTargetFrameIndex(FI, MVT::i64));
    else
      return CurDAG->getTargetNode(IA64::MOV, dl, MVT::i64,
                                   CurDAG->getTargetFrameIndex(FI, MVT::i64));
  }

  case ISD::ConstantPool: { // TODO: nuke the constant pool
                            // (ia64 doesn't need one)
    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
    Constant *C = CP->getConstVal();
    SDValue CPI = CurDAG->getTargetConstantPool(C, MVT::i64,
                                                CP->getAlignment());
    return CurDAG->getTargetNode(IA64::ADDL_GA, dl, MVT::i64, // ?
                                 CurDAG->getRegister(IA64::r1, MVT::i64), CPI);
  }

  case ISD::GlobalAddress: {
    GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
    SDValue GA = CurDAG->getTargetGlobalAddress(GV, MVT::i64);
    SDValue Tmp =
      SDValue(CurDAG->getTargetNode(IA64::ADDL_GA, dl, MVT::i64,
                                    CurDAG->getRegister(IA64::r1, MVT::i64),
                                    GA), 0);
    return CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, MVT::Other,
                                 Tmp, CurDAG->getEntryNode());
  }

/* XXX
  case ISD::ExternalSymbol: {
    SDValue EA = CurDAG->getTargetExternalSymbol(
      cast<ExternalSymbolSDNode>(N)->getSymbol(), MVT::i64);
    SDValue Tmp = CurDAG->getTargetNode(IA64::ADDL_EA, dl, MVT::i64,
                                        CurDAG->getRegister(IA64::r1,
                                                            MVT::i64), EA);
    return CurDAG->getTargetNode(IA64::LD8, dl, MVT::i64, Tmp);
  }
*/

  case ISD::LOAD: { // FIXME: load -1, not 1, for bools?
    LoadSDNode *LD = cast<LoadSDNode>(N);
    SDValue Chain = LD->getChain();
    SDValue Address = LD->getBasePtr();

    MVT TypeBeingLoaded = LD->getMemoryVT();
    unsigned Opc;
    switch (TypeBeingLoaded.getSimpleVT()) {
    default:
#ifndef NDEBUG
      N->dump(CurDAG);
#endif
      assert(0 && "Cannot load this type!");
    case MVT::i1: { // this is a bool
      Opc = IA64::LD1; // first we load a byte, then compare for != 0
      if (N->getValueType(0) == MVT::i1) { // XXX: early exit!
        return CurDAG->SelectNodeTo(N, IA64::CMPNE, MVT::i1, MVT::Other,
                                    SDValue(CurDAG->getTargetNode(Opc, dl,
                                                                  MVT::i64,
                                                                  Address),
                                            0),
                                    CurDAG->getRegister(IA64::r0, MVT::i64),
                                    Chain);
      }
      /* otherwise, we want to load a bool into something bigger: LD1
         will do that for us, so we just fall through */
    }
    case MVT::i8:  Opc = IA64::LD1; break;
    case MVT::i16: Opc = IA64::LD2; break;
    case MVT::i32: Opc = IA64::LD4; break;
    case MVT::i64: Opc = IA64::LD8; break;

    case MVT::f32: Opc = IA64::LDF4; break;
    case MVT::f64: Opc = IA64::LDF8; break;
    }

    // TODO: comment this
    return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other,
                                Address, Chain);
  }

  case ISD::STORE: {
    StoreSDNode *ST = cast<StoreSDNode>(N);
    SDValue Address = ST->getBasePtr();
    SDValue Chain = ST->getChain();

    unsigned Opc;
    if (ISD::isNON_TRUNCStore(N)) {
      switch (N->getOperand(1).getValueType().getSimpleVT()) {
      default: assert(0 && "unknown type in store");
      case MVT::i1: { // this is a bool
        Opc = IA64::ST1; // we store either 0 or 1 as a byte
        // first load zero!
        SDValue Initial = CurDAG->getCopyFromReg(Chain, dl, IA64::r0,
                                                 MVT::i64);
        Chain = Initial.getValue(1);
        // then load 1 into the same reg iff the predicate to store is 1
        SDValue Tmp = ST->getValue();
        Tmp =
          SDValue(CurDAG->getTargetNode(IA64::TPCADDS, dl, MVT::i64, Initial,
                                        CurDAG->getTargetConstant(1,
                                                                  MVT::i64),
                                        Tmp), 0);
        return CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Tmp, Chain);
      }
      case MVT::i64: Opc = IA64::ST8;  break;
      case MVT::f64: Opc = IA64::STF8; break;
      }
    } else { // Truncating store
      switch (ST->getMemoryVT().getSimpleVT()) {
      default: assert(0 && "unknown type in truncstore");
      case MVT::i8:  Opc = IA64::ST1;  break;
      case MVT::i16: Opc = IA64::ST2;  break;
      case MVT::i32: Opc = IA64::ST4;  break;
      case MVT::f32: Opc = IA64::STF4; break;
      }
    }

    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    return CurDAG->SelectNodeTo(N, Opc, MVT::Other, N2, N1, Chain);
  }

  case ISD::BRCOND: {
    SDValue Chain = N->getOperand(0);
    SDValue CC = N->getOperand(1);
    MachineBasicBlock *Dest =
      cast<BasicBlockSDNode>(N->getOperand(2))->getBasicBlock();
    //FIXME - we do NOT need long branches all the time
    return CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, CC,
                                CurDAG->getBasicBlock(Dest), Chain);
  }

  case ISD::CALLSEQ_START:
  case ISD::CALLSEQ_END: {
    int64_t Amt = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    unsigned Opc = N->getOpcode() == ISD::CALLSEQ_START ?
                     IA64::ADJUSTCALLSTACKDOWN : IA64::ADJUSTCALLSTACKUP;
    SDValue N0 = N->getOperand(0);
    return CurDAG->SelectNodeTo(N, Opc, MVT::Other, getI64Imm(Amt), N0);
  }

  case ISD::BR: {
    // FIXME: we don't need long branches all the time!
    SDValue N0 = N->getOperand(0);
    return CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other,
                                N->getOperand(1), N0);
  }
  }

  return SelectCode(Op);
}
/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
/// implicit physical register output.
void InstrEmitter::
EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
                unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
  unsigned VRBase = 0;
  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
    // Just use the input register directly!
    SDValue Op(Node, ResNo);
    if (IsClone)
      VRBaseMap.erase(Op);
    bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
    (void)isNew; // Silence compiler warning.
    assert(isNew && "Node emitted out of order - early");
    return;
  }

  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
  // the CopyToReg'd destination register instead of creating a new vreg.
  bool MatchReg = true;
  const TargetRegisterClass *UseRC = NULL;
  EVT VT = Node->getValueType(ResNo);

  // Stick to the preferred register classes for legal types.
  if (TLI->isTypeLegal(VT))
    UseRC = TLI->getRegClassFor(VT);

  if (!IsClone && !IsCloned)
    for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
         UI != E; ++UI) {
      SDNode *User = *UI;
      bool Match = true;
      if (User->getOpcode() == ISD::CopyToReg &&
          User->getOperand(2).getNode() == Node &&
          User->getOperand(2).getResNo() == ResNo) {
        unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
        if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
          VRBase = DestReg;
          Match = false;
        } else if (DestReg != SrcReg)
          Match = false;
      } else {
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          SDValue Op = User->getOperand(i);
          if (Op.getNode() != Node || Op.getResNo() != ResNo)
            continue;
          EVT VT = Node->getValueType(Op.getResNo());
          if (VT == MVT::Other || VT == MVT::Glue)
            continue;
          Match = false;
          if (User->isMachineOpcode()) {
            const MCInstrDesc &II = TII->get(User->getMachineOpcode());
            const TargetRegisterClass *RC = 0;
            if (i+II.getNumDefs() < II.getNumOperands())
              RC = TII->getRegClass(II, i+II.getNumDefs(), TRI);
            if (!UseRC)
              UseRC = RC;
            else if (RC) {
              const TargetRegisterClass *ComRC =
                TRI->getCommonSubClass(UseRC, RC);
              // If multiple uses expect disjoint register classes, we emit
              // copies in AddRegisterOperand.
              if (ComRC)
                UseRC = ComRC;
            }
          }
        }
      }
      MatchReg &= Match;
      if (VRBase)
        break;
    }

  const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
  SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);

  // Figure out the register class to create for the destreg.
  if (VRBase) {
    DstRC = MRI->getRegClass(VRBase);
  } else if (UseRC) {
    assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
    DstRC = UseRC;
  } else {
    DstRC = TLI->getRegClassFor(VT);
  }

  // If all uses are reading from the src physical register and copying the
  // register is either impossible or very expensive, then don't create a copy.
  if (MatchReg && SrcRC->getCopyCost() < 0) {
    VRBase = SrcReg;
  } else {
    // Create the reg, emit the copy.
    VRBase = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
            TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
  }

  SDValue Op(Node, ResNo);
  if (IsClone)
    VRBaseMap.erase(Op);
  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
  (void)isNew; // Silence compiler warning.
  assert(isNew && "Node emitted out of order - early");
}
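// The user scan in EmitCopyFromReg implements a simple peephole: if some
// user is a CopyToReg of exactly this result into a virtual register, reuse
// that vreg instead of minting a new one. A condensed sketch of just that
// match (findCopyToRegVReg is a hypothetical helper, not LLVM API):
static unsigned findCopyToRegVReg(SDNode *Node, unsigned ResNo) {
  for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
       UI != E; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() == ISD::CopyToReg &&
        User->getOperand(2).getNode() == Node &&
        User->getOperand(2).getResNo() == ResNo) {
      unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
      if (TargetRegisterInfo::isVirtualRegister(DestReg))
        return DestReg; // Reusable destination vreg.
    }
  }
  return 0; // No reusable vreg; the caller creates one.
}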
/// ClusterNeighboringLoads - Force nearby loads together by "flagging" them.
/// This function finds loads of the same base and different offsets. If the
/// offsets are not far apart (target specific), it adds MVT::Flag inputs and
/// outputs to ensure they are scheduled together and in order. This
/// optimization may benefit some targets by improving cache locality.
void ScheduleDAGSDNodes::ClusterNeighboringLoads() {
  SmallPtrSet<SDNode*, 16> Visited;
  SmallVector<int64_t, 4> Offsets;
  DenseMap<long long, SDNode*> O2SMap;  // Map from offset to SDNode.
  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
       E = DAG->allnodes_end(); NI != E; ++NI) {
    SDNode *Node = &*NI;
    if (!Node || !Node->isMachineOpcode())
      continue;

    unsigned Opc = Node->getMachineOpcode();
    const TargetInstrDesc &TID = TII->get(Opc);
    if (!TID.mayLoad())
      continue;

    SDNode *Chain = 0;
    unsigned NumOps = Node->getNumOperands();
    if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
      Chain = Node->getOperand(NumOps-1).getNode();
    if (!Chain)
      continue;

    // Look for other loads of the same chain. Find loads that are loading
    // from the same base pointer and different offsets.
    Visited.clear();
    Offsets.clear();
    O2SMap.clear();
    bool Cluster = false;
    SDNode *Base = Node;
    int64_t BaseOffset;
    for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
         I != E; ++I) {
      SDNode *User = *I;
      if (User == Node || !Visited.insert(User))
        continue;
      int64_t Offset1, Offset2;
      if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
          Offset1 == Offset2)
        // FIXME: Should be ok if the addresses are identical. But earlier
        // optimizations really should have eliminated one of the loads.
        continue;
      if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
        Offsets.push_back(Offset1);
      O2SMap.insert(std::make_pair(Offset2, User));
      Offsets.push_back(Offset2);
      if (Offset2 < Offset1) {
        Base = User;
        BaseOffset = Offset2;
      } else {
        BaseOffset = Offset1;
      }
      Cluster = true;
    }

    if (!Cluster)
      continue;

    // Sort them in increasing order.
    std::sort(Offsets.begin(), Offsets.end());

    // Check if the loads are close enough.
    SmallVector<SDNode*, 4> Loads;
    unsigned NumLoads = 0;
    int64_t BaseOff = Offsets[0];
    SDNode *BaseLoad = O2SMap[BaseOff];
    Loads.push_back(BaseLoad);
    for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
      int64_t Offset = Offsets[i];
      SDNode *Load = O2SMap[Offset];
      if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,
                                        NumLoads))
        break; // Stop right here. Ignore loads that are further away.
      Loads.push_back(Load);
      ++NumLoads;
    }

    if (NumLoads == 0)
      continue;

    // Cluster loads by adding MVT::Flag outputs and inputs. This also
    // ensures they are scheduled in order of increasing addresses.
    SDNode *Lead = Loads[0];
    AddFlags(Lead, SDValue(0, 0), true, DAG);
    SDValue InFlag = SDValue(Lead, Lead->getNumValues()-1);
    for (unsigned i = 1, e = Loads.size(); i != e; ++i) {
      bool OutFlag = i < e-1;
      SDNode *Load = Loads[i];
      AddFlags(Load, InFlag, OutFlag, DAG);
      if (OutFlag)
        InFlag = SDValue(Load, Load->getNumValues()-1);
      ++LoadsClustered;
    }
  }
}
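// The clustering decision above reduces to: collect (offset, load) pairs,
// sort by offset, and grow a run from the lowest offset while the target
// hook shouldScheduleLoadsNear accepts each next load. A standalone sketch
// of that windowing step over plain integers, with a fixed distance bound
// standing in for the target hook (clusterWindow and MaxDist are
// illustrative assumptions, not LLVM API):
#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<int64_t> clusterWindow(std::vector<int64_t> Offsets,
                                          int64_t MaxDist) {
  std::sort(Offsets.begin(), Offsets.end());
  std::vector<int64_t> Run;
  if (Offsets.empty())
    return Run;
  int64_t Base = Offsets.front();
  for (int64_t Off : Offsets) {
    if (Off - Base > MaxDist)
      break; // Stop right here; farther loads stay unclustered.
    Run.push_back(Off);
  }
  return Run;
}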
/// Returns a single number reflecting the benefit of scheduling SU
/// in the current cycle.
signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
  // Initial trivial priority.
  signed ResCount = 1;

  // Do not waste time on a node that is already scheduled.
  if (SU->isScheduled)
    return ResCount;

  // Forced priority is high.
  if (SU->isScheduleHigh)
    ResCount += PriorityOne;

  // Adaptable scheduling
  // A small, but very parallel
  // region, where reg pressure is an issue.
  if (HorizontalVerticalBalance > RegPressureThreshold) {
    // Critical path first
    ResCount += (SU->getHeight() * ScaleTwo);
    // If resources are available for it, multiply the
    // chance of scheduling.
    if (isResourceAvailable(SU))
      ResCount <<= FactorOne;

    // Consider change to reg pressure from scheduling
    // this SU.
    ResCount -= (regPressureDelta(SU, true) * ScaleOne);
  }
  // Default heuristic, greedy and
  // critical path driven.
  else {
    // Critical path first.
    ResCount += (SU->getHeight() * ScaleTwo);
    // Now see how many instructions are blocked by this SU.
    ResCount += (NumNodesSolelyBlocking[SU->NodeNum] * ScaleTwo);
    // If resources are available for it, multiply the
    // chance of scheduling.
    if (isResourceAvailable(SU))
      ResCount <<= FactorOne;

    ResCount -= (regPressureDelta(SU) * ScaleTwo);
  }

  // These are platform specific things.
  // Will need to go into the back end
  // and be accessed from here via a hook.
  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
    if (N->isMachineOpcode()) {
      const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
      if (TID.isCall())
        ResCount += (PriorityThree + (ScaleThree * N->getNumValues()));
    } else
      switch (N->getOpcode()) {
      default: break;
      case ISD::TokenFactor:
      case ISD::CopyFromReg:
      case ISD::CopyToReg:
        ResCount += PriorityFive;
        break;

      case ISD::INLINEASM:
        ResCount += PriorityFour;
        break;
      }
  }
  return ResCount;
}
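// Worked example of the default (greedy) branch above, using hypothetical
// constant values -- the real PriorityOne/ScaleOne/... constants live in
// ResourcePriorityQueue and are not shown in this snippet. Assume
// ScaleTwo = 10 and FactorOne = 2, and an unscheduled, non-forced SU with
// height 3 that solely blocks 2 nodes, has its resources available, and a
// register pressure delta of 1:
//
//   ResCount = 1 + 3*10 + 2*10 = 51    // trivial + critical path + blocked
//   ResCount <<= 2             -> 204  // resource-availability boost
//   ResCount -= 1*10           -> 194  // register pressure penalty
//
// The shift means resource availability scales the whole priority rather
// than adding a fixed bonus, so tall, unblocked, resource-ready nodes
// dominate the queue.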
/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
/// implicit physical register output.
void ScheduleDAGSDNodes::EmitCopyFromReg(SDNode *Node, unsigned ResNo,
                                         bool IsClone, bool IsCloned,
                                         unsigned SrcReg,
                                         DenseMap<SDValue, unsigned> &VRBaseMap) {
  unsigned VRBase = 0;
  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
    // Just use the input register directly!
    SDValue Op(Node, ResNo);
    if (IsClone)
      VRBaseMap.erase(Op);
    bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
    (void)isNew; // Silence compiler warning.
    assert(isNew && "Node emitted out of order - early");
    return;
  }

  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
  // the CopyToReg'd destination register instead of creating a new vreg.
  bool MatchReg = true;
  const TargetRegisterClass *UseRC = NULL;
  if (!IsClone && !IsCloned)
    for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
         UI != E; ++UI) {
      SDNode *User = *UI;
      bool Match = true;
      if (User->getOpcode() == ISD::CopyToReg &&
          User->getOperand(2).getNode() == Node &&
          User->getOperand(2).getResNo() == ResNo) {
        unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
        if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
          VRBase = DestReg;
          Match = false;
        } else if (DestReg != SrcReg)
          Match = false;
      } else {
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          SDValue Op = User->getOperand(i);
          if (Op.getNode() != Node || Op.getResNo() != ResNo)
            continue;
          MVT VT = Node->getValueType(Op.getResNo());
          if (VT == MVT::Other || VT == MVT::Flag)
            continue;
          Match = false;
          if (User->isMachineOpcode()) {
            const TargetInstrDesc &II = TII->get(User->getMachineOpcode());
            const TargetRegisterClass *RC =
              getInstrOperandRegClass(TRI, II, i+II.getNumDefs());
            if (!UseRC)
              UseRC = RC;
            else if (RC) {
              if (UseRC->hasSuperClass(RC))
                UseRC = RC;
              else
                assert((UseRC == RC || RC->hasSuperClass(UseRC)) &&
                       "Multiple uses expecting different register classes!");
            }
          }
        }
      }
      MatchReg &= Match;
      if (VRBase)
        break;
    }

  MVT VT = Node->getValueType(ResNo);
  const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
  SrcRC = TRI->getPhysicalRegisterRegClass(SrcReg, VT);

  // Figure out the register class to create for the destreg.
  if (VRBase) {
    DstRC = MRI.getRegClass(VRBase);
  } else if (UseRC) {
    assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
    DstRC = UseRC;
  } else {
    DstRC = TLI->getRegClassFor(VT);
  }

  // If all uses are reading from the src physical register and copying the
  // register is either impossible or very expensive, then don't create a copy.
  if (MatchReg && SrcRC->getCopyCost() < 0) {
    VRBase = SrcReg;
  } else {
    // Create the reg, emit the copy.
    VRBase = MRI.createVirtualRegister(DstRC);
    bool Emitted = TII->copyRegToReg(*BB, InsertPos, VRBase, SrcReg,
                                     DstRC, SrcRC);
    // If the target didn't handle the copy with different register
    // classes and the destination is a subset of the source,
    // try a normal same-RC copy.
    if (!Emitted && DstRC->hasSuperClass(SrcRC))
      Emitted = TII->copyRegToReg(*BB, InsertPos, VRBase, SrcReg,
                                  SrcRC, SrcRC);
    assert(Emitted && "Unable to issue a copy instruction!\n");
  }

  SDValue Op(Node, ResNo);
  if (IsClone)
    VRBaseMap.erase(Op);
  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
  (void)isNew; // Silence compiler warning.
  assert(isNew && "Node emitted out of order - early");
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case ISD::BUILD_VECTOR: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
      break;
    }
    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
    // that adds a 128 bits reg copy when going through TwoAddressInstructions
    // pass. We want to avoid 128 bits copies as much as possible because they
    // can't be bundled by our scheduler.
    SDValue RegSeqArgs[9] = {
      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
    };
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[2 * i + 1] = N->getOperand(i);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
                                RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::ConstantFP:
  case ISD::Constant: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    // XXX: Custom immediate lowering not implemented yet.  Instead we use
    // pseudo instructions defined in SIInstructions.td
    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
      break;
    }
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());

    uint64_t ImmValue = 0;
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;

    if (N->getOpcode() == ISD::ConstantFP) {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::f64);

      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
      APFloat Value = C->getValueAPF();
      float FloatValue = Value.convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = Value.bitcastToAPInt().getZExtValue();
      }
    } else {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::i64);

      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (C->getZExtValue() == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (C->getZExtValue() == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = C->getZExtValue();
      }
    }

    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
         Use != SDNode::use_end(); Use = Next) {
      Next = llvm::next(Use);
      std::vector<SDValue> Ops;
      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
        Ops.push_back(Use->getOperand(i));
      }

      if (!Use->isMachineOpcode()) {
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          // We can only use literal constants (e.g. AMDGPU::ZERO,
          // AMDGPU::ONE, etc) in machine opcodes.
          continue;
        }
      } else {
        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
            (TII->get(Use->getMachineOpcode()).TSFlags &
             R600_InstFlag::VECTOR)) {
          continue;
        }

        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                        R600Operands::IMM);
        assert(ImmIdx != -1);

        // subtract one from ImmIdx, because the DST operand is usually index
        // 0 for MachineInstrs, but we have no DST in the Ops vector.
        ImmIdx--;

        // Check that we aren't already using an immediate.
        // XXX: It's possible for an instruction to have more than one
        // immediate operand, but this is not supported yet.
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          ConstantSDNode *C =
              dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
          assert(C);

          if (C->getZExtValue() != 0) {
            // This instruction is already using an immediate.
            continue;
          }

          // Set the immediate value
          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
        }
      }
      // Set the immediate register
      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);

      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
    }
    break;
  }
  }
  SDNode *Result = SelectCode(N);

  // Fold operands of selected node
  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
    if (Result && Result->isMachineOpcode() &&
        !(TII->get(Result->getMachineOpcode()).TSFlags &
          R600_InstFlag::VECTOR) &&
        TII->isALUInstr(Result->getMachineOpcode())) {
      // Fold FNEG/FABS/CONST_ADDRESS
      // TODO: Isel can generate multiple MachineInst, we need to recursively
      // parse Result
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for (SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
             I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

      // If node has a single use which is CLAMP_R600, folds it
      if (Result->hasOneUse() && Result->isMachineOpcode()) {
        SDNode *PotentialClamp = *Result->use_begin();
        if (PotentialClamp->isMachineOpcode() &&
            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
          unsigned ClampIdx =
              TII->getOperandIdx(Result->getMachineOpcode(),
                                 R600Operands::CLAMP);
          std::vector<SDValue> Ops;
          unsigned NumOp = Result->getNumOperands();
          for (unsigned i = 0; i < NumOp; ++i) {
            Ops.push_back(Result->getOperand(i));
          }
          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
          Result = CurDAG->SelectNodeTo(PotentialClamp,
                                        Result->getMachineOpcode(),
                                        PotentialClamp->getVTList(),
                                        Ops.data(), NumOp);
        }
      }
    }
  }
  return Result;
}
void ScheduleDAGSDNodes::AddSchedEdges() {
  const TargetSubtarget &ST = TM.getSubtarget<TargetSubtarget>();

  // Check to see if the scheduler cares about latencies.
  bool UnitLatencies = ForceUnitLatencies();

  // Pass 2: add the preds, succs, etc.
  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
    SUnit *SU = &SUnits[su];
    SDNode *MainNode = SU->getNode();

    if (MainNode->isMachineOpcode()) {
      unsigned Opc = MainNode->getMachineOpcode();
      const TargetInstrDesc &TID = TII->get(Opc);
      for (unsigned i = 0; i != TID.getNumOperands(); ++i) {
        if (TID.getOperandConstraint(i, TOI::TIED_TO) != -1) {
          SU->isTwoAddress = true;
          break;
        }
      }
      if (TID.isCommutable())
        SU->isCommutable = true;
    }

    // Find all predecessors and successors of the group.
    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
      if (N->isMachineOpcode() &&
          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
        SU->hasPhysRegClobbers = true;
        unsigned NumUsed = InstrEmitter::CountResults(N);
        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
          --NumUsed;    // Skip over unused values at the end.
        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
          SU->hasPhysRegDefs = true;
      }

      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
        SDNode *OpN = N->getOperand(i).getNode();
        if (isPassiveNode(OpN)) continue;   // Not scheduled.
        SUnit *OpSU = &SUnits[OpN->getNodeId()];
        assert(OpSU && "Node has no SUnit!");
        if (OpSU == SU) continue;           // In the same group.

        EVT OpVT = N->getOperand(i).getValueType();
        assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
        bool isChain = OpVT == MVT::Other;

        unsigned PhysReg = 0;
        int Cost = 1;
        // Determine if this is a physical register dependency.
        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
        assert((PhysReg == 0 || !isChain) &&
               "Chain dependence via physreg data?");
        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
        // emits a copy from the physical register to a virtual register unless
        // it requires a cross class copy (cost < 0). That means we are only
        // treating "expensive to copy" register dependency as physical register
        // dependency. This may change in the future though.
        if (Cost >= 0)
          PhysReg = 0;

        // If this is a ctrl dep, latency is 1.
        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
        const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                               OpLatency, PhysReg);
        if (!isChain && !UnitLatencies) {
          ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
          ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
        }

        SU->addPred(dep);
      }
    }
  }
}