bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
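// Illustrative sketch (not taken from the source): assuming isSigned(I) is
// false for a plain add, a uniform i16 add promoted by the code above would
// be rewritten roughly as follows. Value names are hypothetical.
//
//   %r = add i16 %a, %b                    ; before
//
//   %a.ext = zext i16 %a to i32            ; after
//   %b.ext = zext i16 %b to i32
//   %r.ext = add i32 %a.ext, %b.ext
//   %r.trunc = trunc i32 %r.ext to i16
//
// Signed opcodes take the sext path instead, and nsw/nuw/exact flags are
// transferred onto the widened operation before truncating back.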
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}
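// Illustrative sketch (not taken from the source): for a <2 x i32> udiv, the
// scalarization loop above extracts each lane, expands it via expandDivRem32
// (falling back to a plain scalar udiv if the expansion declines), and
// reassembles the vector. Hypothetical value names:
//
//   %n0 = extractelement <2 x i32> %num, i64 0
//   %d0 = extractelement <2 x i32> %den, i64 0
//   %q0 = ... expanded division of %n0 by %d0 ...
//   %v0 = insertelement <2 x i32> undef, i32 %q0, i64 0
//   %n1 = extractelement <2 x i32> %num, i64 1
//   %d1 = extractelement <2 x i32> %den, i64 1
//   %q1 = ... expanded division of %n1 by %d1 ...
//   %r  = insertelement <2 x i32> %v0, i32 %q1, i64 1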
/// updateLoopIterationSpace -- Update loop's iteration space if the loop
/// body is executed for a certain IV range only. For example,
///
/// for (i = 0; i < N; ++i) {
///   if (i > A && i < B) {
///     ...
///   }
/// }
/// is transformed to iterate from A to B, if A > 0 and B < N.
///
bool LoopIndexSplit::updateLoopIterationSpace() {
  SplitCondition = NULL;
  if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE ||
      ExitCondition->getPredicate() == ICmpInst::ICMP_EQ)
    return false;
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Header = L->getHeader();
  BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator());
  if (!BR) return false;
  if (!isa<BranchInst>(Latch->getTerminator())) return false;
  if (BR->isUnconditional()) return false;
  BinaryOperator *AND = dyn_cast<BinaryOperator>(BR->getCondition());
  if (!AND) return false;
  if (AND->getOpcode() != Instruction::And) return false;
  ICmpInst *Op0 = dyn_cast<ICmpInst>(AND->getOperand(0));
  ICmpInst *Op1 = dyn_cast<ICmpInst>(AND->getOperand(1));
  if (!Op0 || !Op1)
    return false;
  IVBasedValues.insert(AND);
  IVBasedValues.insert(Op0);
  IVBasedValues.insert(Op1);
  if (!cleanBlock(Header)) return false;
  BasicBlock *ExitingBlock = ExitCondition->getParent();
  if (!cleanBlock(ExitingBlock)) return false;

  // If the merge point for BR is not the loop latch then skip this loop.
  if (BR->getSuccessor(0) != Latch) {
    DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0));
    assert(DF0 != DF->end() && "Unable to find dominance frontier");
    if (!DF0->second.count(Latch))
      return false;
  }

  if (BR->getSuccessor(1) != Latch) {
    DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1));
    assert(DF1 != DF->end() && "Unable to find dominance frontier");
    if (!DF1->second.count(Latch))
      return false;
  }

  // Verify that the loop exiting block has only two predecessors, where one
  // pred is the split condition block. The other predecessor will become the
  // exiting block's dominator after the CFG is updated. TODO: Handle CFGs
  // where the exiting block has more than two predecessors. This requires
  // extra work in updating dominator information.
  BasicBlock *ExitingBBPred = NULL;
  for (pred_iterator PI = pred_begin(ExitingBlock), PE = pred_end(ExitingBlock);
       PI != PE; ++PI) {
    BasicBlock *BB = *PI;
    if (Header == BB) continue;
    if (ExitingBBPred)
      return false;
    else
      ExitingBBPred = BB;
  }

  if (!restrictLoopBound(*Op0))
    return false;

  if (!restrictLoopBound(*Op1))
    return false;

  // Update CFG.
  if (BR->getSuccessor(0) == ExitingBlock)
    BR->setUnconditionalDest(BR->getSuccessor(1));
  else
    BR->setUnconditionalDest(BR->getSuccessor(0));

  AND->eraseFromParent();
  if (Op0->use_empty())
    Op0->eraseFromParent();
  if (Op1->use_empty())
    Op1->eraseFromParent();

  // Update dominator info. Now, ExitingBlock has only one predecessor,
  // ExitingBBPred, and it is ExitingBlock's immediate dominator.
  DT->changeImmediateDominator(ExitingBlock, ExitingBBPred);

  BasicBlock *ExitBlock = ExitingBlock->getTerminator()->getSuccessor(1);
  if (L->contains(ExitBlock))
    ExitBlock = ExitingBlock->getTerminator()->getSuccessor(0);

  // If ExitingBlock is a member of a loop basic block's DF list then
  // replace ExitingBlock with the header and exit block in the DF list.
  DominanceFrontier::iterator ExitingBlockDF = DF->find(ExitingBlock);
  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
       I != E; ++I) {
    BasicBlock *BB = *I;
    if (BB == Header || BB == ExitingBlock) continue;
    DominanceFrontier::iterator BBDF = DF->find(BB);
    DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin();
    DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end();
    while (DomSetI != DomSetE) {
      DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI;
      ++DomSetI;
      BasicBlock *DFBB = *CurrentItr;
      if (DFBB == ExitingBlock) {
        BBDF->second.erase(DFBB);
        for (DominanceFrontier::DomSetType::iterator
               EBI = ExitingBlockDF->second.begin(),
               EBE = ExitingBlockDF->second.end(); EBI != EBE; ++EBI)
          BBDF->second.insert(*EBI);
      }
    }
  }
  NumRestrictBounds++;
  return true;
}
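// Illustrative sketch (not taken from the source): once both bounds have been
// folded into the loop bounds by restrictLoopBound, the header's guard holds
// on every remaining iteration, so the conditional branch is made
// unconditional and the guard is erased. Roughly, with hypothetical names:
//
//   %c0  = icmp sgt i32 %iv, %A
//   %c1  = icmp slt i32 %iv, %B
//   %and = and i1 %c0, %c1
//   br i1 %and, label %guarded, label %skip    ; before
//
//   br label %guarded                          ; after setUnconditionalDest;
//                                              ; %and and any dead icmps are
//                                              ; then erased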
// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl =
      II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Report a change only when the fdiv was actually replaced; returning true
  // unconditionally would wrongly mark the function as modified when the
  // scalar fdiv was kept.
  return !!NewFDiv;
}
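// Illustrative sketch (not taken from the source): with !fpmath metadata
// allowing 2.5 ULP, a scalar f32 fdiv would be replaced by a call to the
// fast-division intrinsic. Hypothetical value names:
//
//   %r = fdiv float %x, %y, !fpmath !0        ; before
//   ...
//   !0 = !{float 2.500000e+00}
//
//   %r = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)   ; after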