void IndVarSimplify::EliminateIVRemainders() { // Look for SRem and URem users. for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { IVStrideUse &UI = *I; BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser()); if (!Rem) continue; bool isSigned = Rem->getOpcode() == Instruction::SRem; if (!isSigned && Rem->getOpcode() != Instruction::URem) continue; // We're only interested in the case where we know something about // the numerator. if (UI.getOperandValToReplace() != Rem->getOperand(0)) continue; // Get the SCEVs for the ICmp operands. const SCEV *S = SE->getSCEV(Rem->getOperand(0)); const SCEV *X = SE->getSCEV(Rem->getOperand(1)); // Simplify unnecessary loops away. const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); S = SE->getSCEVAtScope(S, ICmpLoop); X = SE->getSCEVAtScope(X, ICmpLoop); // i % n --> i if i is in [0,n). if ((!isSigned || SE->isKnownNonNegative(S)) && SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, S, X)) Rem->replaceAllUsesWith(Rem->getOperand(0)); else { // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). const SCEV *LessOne = SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); if ((!isSigned || SE->isKnownNonNegative(LessOne)) && SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, LessOne, X)) { ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, Rem->getOperand(0), Rem->getOperand(1), "tmp"); SelectInst *Sel = SelectInst::Create(ICmp, ConstantInt::get(Rem->getType(), 0), Rem->getOperand(0), "tmp", Rem); Rem->replaceAllUsesWith(Sel); } else continue; } // Inform IVUsers about the new users. if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) IU->AddUsersIfInteresting(I); DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); DeadInsts.push_back(Rem); } }
/// Rewrite binary operation \p I as the same operation on operands extended
/// to the type returned by getI32Ty, followed by a truncation back to the
/// original type. \p I is erased on success.
///
/// \returns false (leaving \p I untouched) for sdiv and udiv, true otherwise.
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  // Integer divisions are deliberately not promoted here.
  switch (I.getOpcode()) {
  case Instruction::SDiv:
  case Instruction::UDiv:
    return false;
  default:
    break;
  }

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *WideTy = getI32Ty(Builder, I.getType());

  // Extend both operands, sign- or zero-extending as isSigned(I) dictates.
  Value *WideLHS = nullptr;
  Value *WideRHS = nullptr;
  if (!isSigned(I)) {
    WideLHS = Builder.CreateZExt(I.getOperand(0), WideTy);
    WideRHS = Builder.CreateZExt(I.getOperand(1), WideTy);
  } else {
    WideLHS = Builder.CreateSExt(I.getOperand(0), WideTy);
    WideRHS = Builder.CreateSExt(I.getOperand(1), WideTy);
  }

  // Perform the operation on the extended operands, passing the result
  // through copyFlags together with the original instruction.
  Value *WideRes =
      copyFlags(I, Builder.CreateBinOp(I.getOpcode(), WideLHS, WideRHS));

  // Narrow the result back and substitute it for the original instruction.
  Value *NarrowRes = Builder.CreateTrunc(WideRes, I.getType());
  I.replaceAllUsesWith(NarrowRes);
  I.eraseFromParent();
  return true;
}
/// Rewrite binary operation \p I as the same operation on operands extended
/// to the type returned by getI32Ty, followed by a truncation back to the
/// original type. \p I is erased on success.
///
/// \returns false (leaving \p I untouched) for sdiv, udiv, srem and urem,
/// true otherwise.
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  // Integer division and remainder are deliberately not promoted here.
  switch (I.getOpcode()) {
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return false;
  default:
    break;
  }

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *WideTy = getI32Ty(Builder, I.getType());

  // Extend both operands, sign- or zero-extending as isSigned(I) dictates.
  Value *WideLHS = nullptr;
  Value *WideRHS = nullptr;
  if (!isSigned(I)) {
    WideLHS = Builder.CreateZExt(I.getOperand(0), WideTy);
    WideRHS = Builder.CreateZExt(I.getOperand(1), WideTy);
  } else {
    WideLHS = Builder.CreateSExt(I.getOperand(0), WideTy);
    WideRHS = Builder.CreateSExt(I.getOperand(1), WideTy);
  }

  Value *WideRes = Builder.CreateBinOp(I.getOpcode(), WideLHS, WideRHS);

  // The builder may hand back something that is not an instruction; flags
  // can only be attached when an instruction was actually created.
  Instruction *WideInst = dyn_cast<Instruction>(WideRes);
  if (WideInst) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      WideInst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      WideInst->setHasNoUnsignedWrap();

    // Carry the 'exact' bit over for operations that can have one.
    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      WideInst->setIsExact(ExactOp->isExact());
  }

  // Narrow the result back and substitute it for the original instruction.
  Value *NarrowRes = Builder.CreateTrunc(WideRes, I.getType());
  I.replaceAllUsesWith(NarrowRes);
  I.eraseFromParent();
  return true;
}
/// Visit a binary operator. A uniform operation that needs promotion is first
/// offered to promoteUniformOpToI32 (only when the subtarget has 16-bit
/// instructions). Otherwise, integer division/remainder whose scalar width is
/// at most 32 bits is handed to expandDivRem32, element by element for
/// vectors.
///
/// \returns true if \p I was replaced.
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  const Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();

  const bool IsIntDivRem =
      Opc == Instruction::URem || Opc == Instruction::UDiv ||
      Opc == Instruction::SRem || Opc == Instruction::SDiv;
  if (!IsIntDivRem || Ty->getScalarSizeInBits() > 32)
    return false;

  Value *Num = I.getOperand(0);
  Value *Den = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Value *Expanded = nullptr;
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    // Scalarize: expand every lane separately and rebuild the vector. Lanes
    // that expandDivRem32 declines keep the plain operation.
    Expanded = UndefValue::get(VT);
    const unsigned NumElts = VT->getNumElements();
    for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
      Value *NumElt = Builder.CreateExtractElement(Num, Idx);
      Value *DenElt = Builder.CreateExtractElement(Den, Idx);

      Value *ResElt = expandDivRem32(Builder, I, NumElt, DenElt);
      if (!ResElt)
        ResElt = Builder.CreateBinOp(Opc, NumElt, DenElt);

      Expanded = Builder.CreateInsertElement(Expanded, ResElt, Idx);
    }
  } else {
    Expanded = expandDivRem32(Builder, I, Num, Den);
  }

  // No expansion was produced: the instruction stays as it is.
  if (!Expanded)
    return false;

  I.replaceAllUsesWith(Expanded);
  I.eraseFromParent();
  return true;
}
/// HandleFloatingPointIV - If the loop has floating induction variable /// then insert corresponding integer induction variable if possible. /// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) /// is converted into /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. ConstantFP *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) return; // Check IV increment. Reject this PN if increment operation is not // an add or increment value can not be represented by an integer. BinaryOperator *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp // is not an integer, bail out. ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); int64_t IncValue; if (IncValueVal == 0 || Incr->getOperand(0) != PN || !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) return; // Check Incr uses. One user is PN and the other user is an exit condition // used by the conditional terminator. Value::use_iterator IncrUse = Incr->use_begin(); Instruction *U1 = cast<Instruction>(IncrUse++); if (IncrUse == Incr->use_end()) return; Instruction *U2 = cast<Instruction>(IncrUse++); if (IncrUse != Incr->use_end()) return; // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't // only used by a branch, we can't transform it. 
FCmpInst *Compare = dyn_cast<FCmpInst>(U1); if (!Compare) Compare = dyn_cast<FCmpInst>(U2); if (Compare == 0 || !Compare->hasOneUse() || !isa<BranchInst>(Compare->use_back())) return; BranchInst *TheBr = cast<BranchInst>(Compare->use_back()); // We need to verify that the branch actually controls the iteration count // of the loop. If not, the new IV can overflow and no one will notice. // The branch block must be in the loop and one of the successors must be out // of the loop. assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); if (!L->contains(TheBr->getParent()) || (L->contains(TheBr->getSuccessor(0)) && L->contains(TheBr->getSuccessor(1)))) return; // If it isn't a comparison with an integer-as-fp (the exit value), we can't // transform it. ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; if (ExitValueVal == 0 || !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) return; // Find new predicate for integer comparison. CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; switch (Compare->getPredicate()) { default: return; // Unknown comparison. case CmpInst::FCMP_OEQ: case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; case CmpInst::FCMP_ONE: case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; case CmpInst::FCMP_OGT: case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; case CmpInst::FCMP_OLT: case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; case CmpInst::FCMP_OLE: case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; } // We convert the floating point induction variable to a signed i32 value if // we can. This is only safe if the comparison will not overflow in a way // that won't be trapped by the integer equivalent operations. Check for this // now. // TODO: We could use i64 if it is native and the range requires it. 
// The start/stride/exit values must all fit in signed i32. if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) return; // If not actually striding (add x, 0.0), avoid touching the code. if (IncValue == 0) return; // Positive and negative strides have different safety conditions. if (IncValue > 0) { // If we have a positive stride, we require the init to be less than the // exit value and an equality or less than comparison. if (InitValue >= ExitValue || NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) return; uint32_t Range = uint32_t(ExitValue-InitValue); if (NewPred == CmpInst::ICMP_SLE) { // Normalize SLE -> SLT, check for infinite loop. if (++Range == 0) return; // Range overflows. } unsigned Leftover = Range % uint32_t(IncValue); // If this is an equality comparison, we require that the strided value // exactly land on the exit value, otherwise the IV condition will wrap // around and do things the fp IV wouldn't. if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && Leftover != 0) return; // If the stride would wrap around the i32 before exiting, we can't // transform the IV. if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) return; } else { // If we have a negative stride, we require the init to be greater than the // exit value and an equality or greater than comparison. if (InitValue >= ExitValue || NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) return; uint32_t Range = uint32_t(InitValue-ExitValue); if (NewPred == CmpInst::ICMP_SGE) { // Normalize SGE -> SGT, check for infinite loop. if (++Range == 0) return; // Range overflows. } unsigned Leftover = Range % uint32_t(-IncValue); // If this is an equality comparison, we require that the strided value // exactly land on the exit value, otherwise the IV condition will wrap // around and do things the fp IV wouldn't. 
if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && Leftover != 0) return; // If the stride would wrap around the i32 before exiting, we can't // transform the IV. if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) return; } const IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); // Insert new integer induction variable. PHINode *NewPHI = PHINode::Create(Int32Ty, PN->getName()+".int", PN); NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), PN->getIncomingBlock(IncomingEdge)); Value *NewAdd = BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), Incr->getName()+".int", Incr); NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd, ConstantInt::get(Int32Ty, ExitValue), Compare->getName()); // In the following deletions, PN may become dead and may be deleted. // Use a WeakVH to observe whether this happens. WeakVH WeakPH = PN; // Delete the old floating point exit comparison. The branch starts using the // new comparison. NewCompare->takeName(Compare); Compare->replaceAllUsesWith(NewCompare); RecursivelyDeleteTriviallyDeadInstructions(Compare); // Delete the old floating point increment. Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); RecursivelyDeleteTriviallyDeadInstructions(Incr); // If the FP induction variable still has uses, this is because something else // in the loop uses its value. In order to canonicalize the induction // variable, we chose to eliminate the IV and rewrite it in terms of an // int->fp cast. // // We give preference to sitofp over uitofp because it is faster on most // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", PN->getParent()->getFirstNonPHI()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } // Add a new IVUsers entry for the newly-created integer PHI. IU->AddUsersIfInteresting(NewPHI); }
// Insert an intrinsic for fast fdiv for safe math situations where we can // reduce precision. Leave fdiv for situations where the generic node is // expected to be optimized. bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Type *Ty = FDiv.getType(); // TODO: Handle half if (!Ty->getScalarType()->isFloatTy()) return false; MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); if (!FPMath) return false; const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); float ULP = FPOp->getFPAccuracy(); if (ULP < 2.5f) return false; FastMathFlags FMF = FPOp->getFastMathFlags(); bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || FMF.allowReciprocal(); if (ST->hasFP32Denormals() && !UnsafeDiv) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); Builder.setFastMathFlags(FMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); Function *Decl = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); Value *Num = FDiv.getOperand(0); Value *Den = FDiv.getOperand(1); Value *NewFDiv = nullptr; if (VectorType *VT = dyn_cast<VectorType>(Ty)) { NewFDiv = UndefValue::get(VT); // FIXME: Doesn't do the right thing for cases where the vector is partially // constant. This works when the scalarizer pass is run first. for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { Value *NumEltI = Builder.CreateExtractElement(Num, I); Value *DenEltI = Builder.CreateExtractElement(Den, I); Value *NewElt; if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { NewElt = Builder.CreateFDiv(NumEltI, DenEltI); } else { NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); } NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } } else { if (!shouldKeepFDivF32(Num, UnsafeDiv)) NewFDiv = Builder.CreateCall(Decl, { Num, Den }); } if (NewFDiv) { FDiv.replaceAllUsesWith(NewFDiv); NewFDiv->takeName(&FDiv); FDiv.eraseFromParent(); } return true; }
/// HandleFloatingPointIV - If the loop has floating induction variable /// then insert corresponding integer induction variable if possible. /// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) /// is converted into /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH) { unsigned IncomingEdge = L->contains(PH->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. ConstantFP *InitValue = dyn_cast<ConstantFP>(PH->getIncomingValue(IncomingEdge)); if (!InitValue) return; uint64_t newInitValue = Type::getInt32Ty(PH->getContext())->getPrimitiveSizeInBits(); if (!convertToInt(InitValue->getValueAPF(), &newInitValue)) return; // Check IV increment. Reject this PH if increment operation is not // an add or increment value can not be represented by an integer. BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(BackEdge)); if (!Incr) return; if (Incr->getOpcode() != Instruction::FAdd) return; ConstantFP *IncrValue = NULL; unsigned IncrVIndex = 1; if (Incr->getOperand(1) == PH) IncrVIndex = 0; IncrValue = dyn_cast<ConstantFP>(Incr->getOperand(IncrVIndex)); if (!IncrValue) return; uint64_t newIncrValue = Type::getInt32Ty(PH->getContext())->getPrimitiveSizeInBits(); if (!convertToInt(IncrValue->getValueAPF(), &newIncrValue)) return; // Check Incr uses. One user is PH and the other users is exit condition used // by the conditional terminator. Value::use_iterator IncrUse = Incr->use_begin(); Instruction *U1 = cast<Instruction>(IncrUse++); if (IncrUse == Incr->use_end()) return; Instruction *U2 = cast<Instruction>(IncrUse++); if (IncrUse != Incr->use_end()) return; // Find exit condition. FCmpInst *EC = dyn_cast<FCmpInst>(U1); if (!EC) EC = dyn_cast<FCmpInst>(U2); if (!EC) return; if (BranchInst *BI = dyn_cast<BranchInst>(EC->getParent()->getTerminator())) { if (!BI->isConditional()) return; if (BI->getCondition() != EC) return; } // Find exit value. 
If exit value can not be represented as an integer then // do not handle this floating point PH. ConstantFP *EV = NULL; unsigned EVIndex = 1; if (EC->getOperand(1) == Incr) EVIndex = 0; EV = dyn_cast<ConstantFP>(EC->getOperand(EVIndex)); if (!EV) return; uint64_t intEV = Type::getInt32Ty(PH->getContext())->getPrimitiveSizeInBits(); if (!convertToInt(EV->getValueAPF(), &intEV)) return; // Find new predicate for integer comparison. CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; switch (EC->getPredicate()) { case CmpInst::FCMP_OEQ: case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; case CmpInst::FCMP_OGT: case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_UGT; break; case CmpInst::FCMP_OGE: case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_UGE; break; case CmpInst::FCMP_OLT: case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_ULT; break; case CmpInst::FCMP_OLE: case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_ULE; break; default: break; } if (NewPred == CmpInst::BAD_ICMP_PREDICATE) return; // Insert new integer induction variable. PHINode *NewPHI = PHINode::Create(Type::getInt32Ty(PH->getContext()), PH->getName()+".int", PH); NewPHI->addIncoming(ConstantInt::get(Type::getInt32Ty(PH->getContext()), newInitValue), PH->getIncomingBlock(IncomingEdge)); Value *NewAdd = BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Type::getInt32Ty(PH->getContext()), newIncrValue), Incr->getName()+".int", Incr); NewPHI->addIncoming(NewAdd, PH->getIncomingBlock(BackEdge)); // The back edge is edge 1 of newPHI, whatever it may have been in the // original PHI. ConstantInt *NewEV = ConstantInt::get(Type::getInt32Ty(PH->getContext()), intEV); Value *LHS = (EVIndex == 1 ? NewPHI->getIncomingValue(1) : NewEV); Value *RHS = (EVIndex == 1 ? NewEV : NewPHI->getIncomingValue(1)); ICmpInst *NewEC = new ICmpInst(EC->getParent()->getTerminator(), NewPred, LHS, RHS, EC->getName()); // In the following deletions, PH may become dead and may be deleted. 
// Use a WeakVH to observe whether this happens. WeakVH WeakPH = PH; // Delete old, floating point, exit comparison instruction. NewEC->takeName(EC); EC->replaceAllUsesWith(NewEC); RecursivelyDeleteTriviallyDeadInstructions(EC); // Delete old, floating point, increment instruction. Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); RecursivelyDeleteTriviallyDeadInstructions(Incr); // Replace floating induction variable, if it isn't already deleted. // Give SIToFPInst preference over UIToFPInst because it is faster on // platforms that are widely used. if (WeakPH && !PH->use_empty()) { if (useSIToFPInst(*InitValue, *EV, newInitValue, intEV)) { SIToFPInst *Conv = new SIToFPInst(NewPHI, PH->getType(), "indvar.conv", PH->getParent()->getFirstNonPHI()); PH->replaceAllUsesWith(Conv); } else { UIToFPInst *Conv = new UIToFPInst(NewPHI, PH->getType(), "indvar.conv", PH->getParent()->getFirstNonPHI()); PH->replaceAllUsesWith(Conv); } RecursivelyDeleteTriviallyDeadInstructions(PH); } // Add a new IVUsers entry for the newly-created integer PHI. IU->AddUsersIfInteresting(NewPHI); }