// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  // Only transform fdivs that carry !fpmath metadata; without it we must
  // assume full precision is required.
  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  // The fast lowering is only accurate to 2.5 ULPs, so bail out if the
  // source requires better accuracy than that.
  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();

  // With FP32 denormals enabled, keep the precise fdiv unless unsafe math
  // permits the fast expansion anyway.
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  // Build the replacement immediately after the fdiv, propagating its
  // fast-math flags, !fpmath metadata, and debug location.
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl =
      II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    // The intrinsic is scalar-only, so expand a vector fdiv elementwise.
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;
      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else if (!shouldKeepFDivF32(Num, UnsafeDiv)) {
    NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually rewritten; the scalar
  // path may have decided to keep the original instruction.
  return !!NewFDiv;
}
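
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this pass): a minimal standalone program
// showing how the !fpmath accuracy that visitFDiv keys on is attached to an
// fdiv and read back through FPMathOperator. It assumes an LLVM build to
// compile and link against; the module and function names ("fpmath_demo",
// "fdiv_demo") are made up for the demo.
//
// #include "llvm/IR/IRBuilder.h"
// #include "llvm/IR/LLVMContext.h"
// #include "llvm/IR/MDBuilder.h"
// #include "llvm/IR/Module.h"
// #include "llvm/IR/Operator.h"
// #include "llvm/Support/raw_ostream.h"
//
// using namespace llvm;
//
// int main() {
//   LLVMContext Ctx;
//   Module M("fpmath_demo", Ctx);
//
//   // float fdiv_demo(float %x, float %y)
//   Type *FloatTy = Type::getFloatTy(Ctx);
//   FunctionType *FTy = FunctionType::get(FloatTy, {FloatTy, FloatTy}, false);
//   Function *F =
//       Function::Create(FTy, Function::ExternalLinkage, "fdiv_demo", &M);
//   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
//
//   IRBuilder<> Builder(BB);
//
//   // Attach !fpmath !{float 2.5} to subsequent FP ops: the frontend's way
//   // of saying "2.5 ULPs of error is acceptable", which is exactly the
//   // threshold visitFDiv checks before using the fast intrinsic.
//   Builder.setDefaultFPMathTag(MDBuilder(Ctx).createFPMath(2.5f));
//
//   Function::arg_iterator AI = F->arg_begin();
//   Value *X = &*AI++;
//   Value *Y = &*AI;
//   Value *Div = Builder.CreateFDiv(X, Y, "d");
//   Builder.CreateRet(Div);
//
//   // Read the accuracy back the same way visitFDiv does.
//   if (const auto *FPOp = dyn_cast<FPMathOperator>(Div))
//     errs() << "fpmath accuracy: " << FPOp->getFPAccuracy() << " ULPs\n";
//
//   M.print(errs(), nullptr);
//   return 0;
// }
// ---------------------------------------------------------------------------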