bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
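// Illustrative sketch (not part of the pass): assuming a uniform 16-bit
// multiply on a target with 16-bit instructions, the promotion above rewrites
// the IR roughly as follows. The value names are hypothetical; whether sext
// or zext is used depends on isSigned(I), and nsw/nuw/exact flags are only
// transferred when the promotedOpIsNSW/promotedOpIsNUW/exact checks above
// allow it.
//
//   Before:
//     %r = mul i16 %a, %b
//
//   After:
//     %a.ext = zext i16 %a to i32
//     %b.ext = zext i16 %b to i32
//     %r.ext = mul i32 %a.ext, %b.ext
//     %r     = trunc i32 %r.ext to i16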
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}
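// Illustrative sketch (not part of the pass): assuming a <2 x i32> udiv that
// expandDivRem32 can lower, the scalarization loop above emits per-lane IR
// roughly like the following. Value names are hypothetical, and "..." stands
// for the 32-bit expansion produced by expandDivRem32:
//
//   %n0 = extractelement <2 x i32> %num, i64 0
//   %d0 = extractelement <2 x i32> %den, i64 0
//   %q0 = ... expansion of udiv %n0, %d0 ...
//   %v0 = insertelement <2 x i32> undef, i32 %q0, i64 0
//   %n1 = extractelement <2 x i32> %num, i64 1
//   %d1 = extractelement <2 x i32> %den, i64 1
//   %q1 = ... expansion of udiv %n1, %d1 ...
//   %r  = insertelement <2 x i32> %v0, i32 %q1, i64 1
//
// Lanes the expansion declines (expandDivRem32 returning null) fall back to a
// plain scalar binary op for that element.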
// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl =
      II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Report a change only when the division was actually rewritten.
  return !!NewFDiv;
}
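// Illustrative sketch (not part of the pass): assuming a float fdiv carrying
// !fpmath metadata that permits at least 2.5 ULP of error, the transform
// above replaces the generic division with the target intrinsic. Value and
// metadata names are hypothetical:
//
//   Before:
//     %r = fdiv float %a, %b, !fpmath !0    ; !0 = !{float 2.500000e+00}
//
//   After:
//     %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
//
// Divisions where shouldKeepFDivF32 reports (based on the numerator) that the
// generic fdiv is preferable are left untouched for later optimization.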