bool HexagonGenExtract::convert(Instruction *In) {
  using namespace PatternMatch;
  Value *BF = nullptr;
  ConstantInt *CSL = nullptr, *CSR = nullptr, *CM = nullptr;
  BasicBlock *BB = In->getParent();
  LLVMContext &Ctx = BB->getContext();
  bool LogicalSR;

  // (and (shl (lshr x, #sr), #sl), #m)
  LogicalSR = true;
  bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
                                     m_ConstantInt(CSL)),
                               m_ConstantInt(CM)));

  if (!Match) {
    // (and (shl (ashr x, #sr), #sl), #m)
    LogicalSR = false;
    Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
                                  m_ConstantInt(CSL)),
                            m_ConstantInt(CM)));
  }
  if (!Match) {
    // (and (shl x, #sl), #m)
    LogicalSR = true;
    CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
    Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)),
                            m_ConstantInt(CM)));
    if (Match && NoSR0)
      return false;
  }
  if (!Match) {
    // (and (lshr x, #sr), #m)
    LogicalSR = true;
    CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
    Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
                            m_ConstantInt(CM)));
  }
  if (!Match) {
    // (and (ashr x, #sr), #m)
    LogicalSR = false;
    CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
    Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
                            m_ConstantInt(CM)));
  }
  if (!Match) {
    CM = nullptr;
    // (shl (lshr x, #sr), #sl)
    LogicalSR = true;
    Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
                            m_ConstantInt(CSL)));
  }
  if (!Match) {
    CM = nullptr;
    // (shl (ashr x, #sr), #sl)
    LogicalSR = false;
    Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
                            m_ConstantInt(CSL)));
  }
  if (!Match)
    return false;

  Type *Ty = BF->getType();
  if (!Ty->isIntegerTy())
    return false;
  unsigned BW = Ty->getPrimitiveSizeInBits();
  if (BW != 32 && BW != 64)
    return false;

  uint32_t SR = CSR->getZExtValue();
  uint32_t SL = CSL->getZExtValue();

  if (!CM) {
    // If there was no and, and the shift left did not remove all potential
    // sign bits created by the shift right, then extractu cannot reproduce
    // this value.
    if (!LogicalSR && (SR > SL))
      return false;
    APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
    CM = ConstantInt::get(Ctx, A);
  }

  // CM is the shifted-left mask. Shift it back right to remove the zero
  // bits on least-significant positions.
  APInt M = CM->getValue().lshr(SL);
  uint32_t T = M.countTrailingOnes();

  // During the shifts some of the bits will be lost. Calculate how many
  // of the original value will remain after shift right and then left.
  uint32_t U = BW - std::max(SL, SR);

  // The width of the extracted field is the minimum of the original bits
  // that remain after the shifts and the number of contiguous 1s in the mask.
  uint32_t W = std::min(U, T);
  if (W == 0)
    return false;

  // Check if the extracted bits are contained within the mask that it is
  // and-ed with. The extract operation will copy these bits, and so the
  // mask cannot have any holes in it that would clear any of the bits of
  // the extracted field.
  if (!LogicalSR) {
    // If the shift right was arithmetic, it could have included some 1 bits.
    // It is still ok to generate extract, but only if the mask eliminates
    // those bits (i.e. M does not have any bits set beyond U).
    APInt C = APInt::getHighBitsSet(BW, BW-U);
    if (M.intersects(C) || !M.isMask(W))
      return false;
  } else {
    // Check if M starts with a contiguous sequence of W times 1 bits. Get
    // the low U bits of M (which eliminates the 0 bits shifted in on the
    // left), and check if the result is APInt's "mask":
    if (!M.getLoBits(U).isMask(W))
      return false;
  }

  IRBuilder<> IRB(In);
  Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
                                   : Intrinsic::hexagon_S2_extractup;
  Module *Mod = BB->getParent()->getParent();
  Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
  Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
  if (SL != 0)
    NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
  In->replaceAllUsesWith(NewIn);
  return true;
}
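// The following is a minimal, self-contained sketch (not part of the pass) of
// the W = min(U, T) width computation above, written in plain C++20 instead of
// APInt. The function name and sample inputs are hypothetical; it only shows
// how SR, SL and the and-mask determine the extractu field width.
#include <algorithm>
#include <bit>
#include <cassert>
#include <cstdint>

static uint32_t extractWidth(uint32_t BW, uint32_t SR, uint32_t SL,
                             uint64_t M) {
  uint64_t Mask = M >> SL;                 // undo the left shift on the mask
  uint32_t T = std::countr_one(Mask);      // contiguous 1s at the low end
  uint32_t U = BW - std::max(SL, SR);      // original bits surviving both shifts
  return std::min(U, T);                   // width of the extracted field
}

int main() {
  // (and (lshr x, 3), 255) on an i32: SR = 3, SL = 0, M = 0xff -> W = 8,
  // which convert() would rewrite as hexagon_S2_extractu(x, 8, 3).
  assert(extractWidth(32, 3, 0, 0xff) == 8);
  return 0;
}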
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
      UseOp.isImplicit())) {
    return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
        TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
         RSUse = MRI.use_begin(RegSeqDstReg),
         RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}
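// A small standalone sketch (hypothetical helper, not part of SIFoldOperands)
// of the sub0/sub1 split performed above: when a 64-bit constant is used
// through a 32-bit subregister, only the matching half is folded.
// APInt::getLoBits(32) corresponds to the low word, APInt::getHiBits(32) to
// the high word.
#include <cassert>
#include <cstdint>

static uint32_t splitImm64(uint64_t Imm, bool IsSub1) {
  return IsSub1 ? static_cast<uint32_t>(Imm >> 32)  // sub1: high 32 bits
                : static_cast<uint32_t>(Imm);       // sub0: low 32 bits
}

int main() {
  uint64_t Imm = 0x123456789abcdef0ULL;
  assert(splitImm64(Imm, /*IsSub1=*/false) == 0x9abcdef0u); // sub0 half
  assert(splitImm64(Imm, /*IsSub1=*/true)  == 0x12345678u); // sub1 half
  return 0;
}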
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      // FIXME: Fold operands with subregs.
      if (OpToFold.isReg() &&
          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
           OpToFold.getSubReg()))
        continue;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();
        const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());

        // FIXME: Fold operands with subregs.
        if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
            UseOp.isImplicit())) {
          continue;
        }

        APInt Imm;

        if (FoldingImm) {
          unsigned UseReg = UseOp.getReg();
          const TargetRegisterClass *UseRC
            = TargetRegisterInfo::isVirtualRegister(UseReg) ?
            MRI.getRegClass(UseReg) :
            TRI.getPhysRegClass(UseReg);

          Imm = APInt(64, OpToFold.getImm());

          // Split 64-bit constants into 32-bits for folding.
          if (UseOp.getSubReg()) {
            if (UseRC->getSize() != 8)
              continue;

            if (UseOp.getSubReg() == AMDGPU::sub0) {
              Imm = Imm.getLoBits(32);
            } else {
              assert(UseOp.getSubReg() == AMDGPU::sub1);
              Imm = Imm.getHiBits(32);
            }
          }

          // In order to fold immediates into copies, we need to change the
          // copy to a MOV.
          if (UseMI->getOpcode() == AMDGPU::COPY) {
            unsigned DestReg = UseMI->getOperand(0).getReg();
            const TargetRegisterClass *DestRC
              = TargetRegisterInfo::isVirtualRegister(DestReg) ?
              MRI.getRegClass(DestReg) :
              TRI.getPhysRegClass(DestReg);

            unsigned MovOp = TII->getMovOpcode(DestRC);
            if (MovOp == AMDGPU::COPY)
              continue;

            UseMI->setDesc(TII->get(MovOp));
          }
        }

        const MCInstrDesc &UseDesc = UseMI->getDesc();

        // Don't fold into target independent nodes. Target independent opcodes
        // don't have defined register classes.
        if (UseDesc.isVariadic() ||
            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
          continue;

        if (FoldingImm) {
          MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
          tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
          continue;
        }

        tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);

        // FIXME: We could try to change the instruction from 64-bit to 32-bit
        // to enable more folding opportunities. The shrink operands pass
        // already does this.
      }

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (!Fold.isImm()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            Fold.OpToFold->setIsKill(false);
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
        }
      }
    }
  }
  return false;
}
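// A minimal sketch (hypothetical predicate, not in the pass) of the code-size
// heuristic in the main loop above: an immediate that is not an inline
// constant becomes a 32-bit literal in every instruction it is folded into,
// so the fold is only attempted when the defining instruction has one use.
#include <cassert>

static bool worthFoldingImm(bool IsInlineConstant, bool HasOneUse) {
  return IsInlineConstant || HasOneUse;
}

int main() {
  assert(worthFoldingImm(true, false));   // inline constants are always free
  assert(!worthFoldingImm(false, false)); // multi-use literal: skip the fold
  return 0;
}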