Skip to content

Commit 992685c

Browse files
committed
fold add with fneg to sub
1 parent 1b1ec73 commit 992685c

File tree

7 files changed

+262
-158
lines changed

7 files changed

+262
-158
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.td

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -476,30 +476,6 @@ def __nvvm_add_rz_d : NVPTXBuiltin<"double(double, double)">;
476476
def __nvvm_add_rm_d : NVPTXBuiltin<"double(double, double)">;
477477
def __nvvm_add_rp_d : NVPTXBuiltin<"double(double, double)">;
478478

479-
// Sub
480-
481-
def __nvvm_sub_rn_ftz_f : NVPTXBuiltin<"float(float, float)">;
482-
def __nvvm_sub_rn_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
483-
def __nvvm_sub_rn_f : NVPTXBuiltin<"float(float, float)">;
484-
def __nvvm_sub_rn_sat_f : NVPTXBuiltin<"float(float, float)">;
485-
def __nvvm_sub_rz_ftz_f : NVPTXBuiltin<"float(float, float)">;
486-
def __nvvm_sub_rz_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
487-
def __nvvm_sub_rz_f : NVPTXBuiltin<"float(float, float)">;
488-
def __nvvm_sub_rz_sat_f : NVPTXBuiltin<"float(float, float)">;
489-
def __nvvm_sub_rm_ftz_f : NVPTXBuiltin<"float(float, float)">;
490-
def __nvvm_sub_rm_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
491-
def __nvvm_sub_rm_f : NVPTXBuiltin<"float(float, float)">;
492-
def __nvvm_sub_rm_sat_f : NVPTXBuiltin<"float(float, float)">;
493-
def __nvvm_sub_rp_ftz_f : NVPTXBuiltin<"float(float, float)">;
494-
def __nvvm_sub_rp_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
495-
def __nvvm_sub_rp_f : NVPTXBuiltin<"float(float, float)">;
496-
def __nvvm_sub_rp_sat_f : NVPTXBuiltin<"float(float, float)">;
497-
498-
def __nvvm_sub_rn_d : NVPTXBuiltin<"double(double, double)">;
499-
def __nvvm_sub_rz_d : NVPTXBuiltin<"double(double, double)">;
500-
def __nvvm_sub_rm_d : NVPTXBuiltin<"double(double, double)">;
501-
def __nvvm_sub_rp_d : NVPTXBuiltin<"double(double, double)">;
502-
503479
// Convert
504480

505481
def __nvvm_d2f_rn_ftz : NVPTXBuiltin<"float(double)">;

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 2 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1520,8 +1520,8 @@ __device__ void nvvm_min_max_sm86() {
15201520
// CHECK: ret void
15211521
}
15221522

1523-
// CHECK-LABEL: nvvm_add_sub_fma_f32_sat
1524-
__device__ void nvvm_add_sub_fma_f32_sat() {
1523+
// CHECK-LABEL: nvvm_add_fma_f32_sat
1524+
__device__ void nvvm_add_fma_f32_sat() {
15251525
// CHECK: call float @llvm.nvvm.add.rn.sat.f
15261526
__nvvm_add_rn_sat_f(1.0f, 2.0f);
15271527
// CHECK: call float @llvm.nvvm.add.rn.ftz.sat.f
@@ -1539,23 +1539,6 @@ __device__ void nvvm_add_sub_fma_f32_sat() {
15391539
// CHECK: call float @llvm.nvvm.add.rp.ftz.sat.f
15401540
__nvvm_add_rp_ftz_sat_f(1.0f, 2.0f);
15411541

1542-
// CHECK: call float @llvm.nvvm.sub.rn.sat.f
1543-
__nvvm_sub_rn_sat_f(1.0f, 2.0f);
1544-
// CHECK: call float @llvm.nvvm.sub.rn.ftz.sat.f
1545-
__nvvm_sub_rn_ftz_sat_f(1.0f, 2.0f);
1546-
// CHECK: call float @llvm.nvvm.sub.rz.sat.f
1547-
__nvvm_sub_rz_sat_f(1.0f, 2.0f);
1548-
// CHECK: call float @llvm.nvvm.sub.rz.ftz.sat.f
1549-
__nvvm_sub_rz_ftz_sat_f(1.0f, 2.0f);
1550-
// CHECK: call float @llvm.nvvm.sub.rm.sat.f
1551-
__nvvm_sub_rm_sat_f(1.0f, 2.0f);
1552-
// CHECK: call float @llvm.nvvm.sub.rm.ftz.sat.f
1553-
__nvvm_sub_rm_ftz_sat_f(1.0f, 2.0f);
1554-
// CHECK: call float @llvm.nvvm.sub.rp.sat.f
1555-
__nvvm_sub_rp_sat_f(1.0f, 2.0f);
1556-
// CHECK: call float @llvm.nvvm.sub.rp.ftz.sat.f
1557-
__nvvm_sub_rp_ftz_sat_f(1.0f, 2.0f);
1558-
15591542
// CHECK: call float @llvm.nvvm.fma.rn.sat.f
15601543
__nvvm_fma_rn_sat_f(1.0f, 2.0f, 3.0f);
15611544
// CHECK: call float @llvm.nvvm.fma.rn.ftz.sat.f
@@ -1575,39 +1558,3 @@ __device__ void nvvm_add_sub_fma_f32_sat() {
15751558

15761559
// CHECK: ret void
15771560
}
1578-
1579-
// CHECK-LABEL: nvvm_sub_f32
1580-
__device__ void nvvm_sub_f32() {
1581-
// CHECK: call float @llvm.nvvm.sub.rn.f
1582-
__nvvm_sub_rn_f(1.0f, 2.0f);
1583-
// CHECK: call float @llvm.nvvm.sub.rn.ftz.f
1584-
__nvvm_sub_rn_ftz_f(1.0f, 2.0f);
1585-
// CHECK: call float @llvm.nvvm.sub.rz.f
1586-
__nvvm_sub_rz_f(1.0f, 2.0f);
1587-
// CHECK: call float @llvm.nvvm.sub.rz.ftz.f
1588-
__nvvm_sub_rz_ftz_f(1.0f, 2.0f);
1589-
// CHECK: call float @llvm.nvvm.sub.rm.f
1590-
__nvvm_sub_rm_f(1.0f, 2.0f);
1591-
// CHECK: call float @llvm.nvvm.sub.rm.ftz.f
1592-
__nvvm_sub_rm_ftz_f(1.0f, 2.0f);
1593-
// CHECK: call float @llvm.nvvm.sub.rp.f
1594-
__nvvm_sub_rp_f(1.0f, 2.0f);
1595-
// CHECK: call float @llvm.nvvm.sub.rp.ftz.f
1596-
__nvvm_sub_rp_ftz_f(1.0f, 2.0f);
1597-
1598-
// CHECK: ret void
1599-
}
1600-
1601-
// CHECK-LABEL: nvvm_sub_f64
1602-
__device__ void nvvm_sub_f64() {
1603-
// CHECK: call double @llvm.nvvm.sub.rn.d
1604-
__nvvm_sub_rn_d(1.0f, 2.0f);
1605-
// CHECK: call double @llvm.nvvm.sub.rz.d
1606-
__nvvm_sub_rz_d(1.0f, 2.0f);
1607-
// CHECK: call double @llvm.nvvm.sub.rm.d
1608-
__nvvm_sub_rm_d(1.0f, 2.0f);
1609-
// CHECK: call double @llvm.nvvm.sub.rp.d
1610-
__nvvm_sub_rp_d(1.0f, 2.0f);
1611-
1612-
// CHECK: ret void
1613-
}

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1581,20 +1581,6 @@ let TargetPrefix = "nvvm" in {
15811581
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>;
15821582
} // rnd
15831583
}
1584-
1585-
//
1586-
// Sub
1587-
//
1588-
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
1589-
foreach ftz = ["", "_ftz"] in {
1590-
foreach sat = ["", "_sat"] in {
1591-
def int_nvvm_sub # rnd # ftz # sat # _f : NVVMBuiltin,
1592-
PureIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>;
1593-
} // sat
1594-
} // ftz
1595-
def int_nvvm_sub # rnd # _d : NVVMBuiltin,
1596-
PureIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>;
1597-
} // rnd
15981584

15991585
//
16001586
// Dot Product

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 161 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -866,14 +866,28 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
866866
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
867867

868868
// We have some custom DAG combine patterns for these nodes
869-
setTargetDAGCombine(
870-
{ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT,
871-
ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
872-
ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
873-
ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
874-
ISD::SREM, ISD::UREM, ISD::VSELECT,
875-
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
876-
ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
869+
setTargetDAGCombine({ISD::ADD,
870+
ISD::AND,
871+
ISD::EXTRACT_VECTOR_ELT,
872+
ISD::FADD,
873+
ISD::FMAXNUM,
874+
ISD::FMINNUM,
875+
ISD::FMAXIMUM,
876+
ISD::FMINIMUM,
877+
ISD::FMAXIMUMNUM,
878+
ISD::FMINIMUMNUM,
879+
ISD::MUL,
880+
ISD::SHL,
881+
ISD::SREM,
882+
ISD::UREM,
883+
ISD::VSELECT,
884+
ISD::BUILD_VECTOR,
885+
ISD::ADDRSPACECAST,
886+
ISD::LOAD,
887+
ISD::STORE,
888+
ISD::ZERO_EXTEND,
889+
ISD::SIGN_EXTEND,
890+
ISD::INTRINSIC_WO_CHAIN});
877891

878892
// setcc for f16x2 and bf16x2 needs special handling to prevent
879893
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6504,6 +6518,143 @@ static SDValue sinkProxyReg(SDValue R, SDValue Chain,
65046518
}
65056519
}
65066520

6521+
static std::optional<unsigned> getSubF32Opc(Intrinsic::ID AddIntrinsicID) {
6522+
switch (AddIntrinsicID) {
6523+
default:
6524+
break;
6525+
case Intrinsic::nvvm_add_rn_f:
6526+
return NVPTXISD::SUB_RN_F;
6527+
case Intrinsic::nvvm_add_rn_sat_f:
6528+
return NVPTXISD::SUB_RN_SAT_F;
6529+
case Intrinsic::nvvm_add_rn_ftz_f:
6530+
return NVPTXISD::SUB_RN_FTZ_F;
6531+
case Intrinsic::nvvm_add_rn_ftz_sat_f:
6532+
return NVPTXISD::SUB_RN_FTZ_SAT_F;
6533+
case Intrinsic::nvvm_add_rz_f:
6534+
return NVPTXISD::SUB_RZ_F;
6535+
case Intrinsic::nvvm_add_rz_sat_f:
6536+
return NVPTXISD::SUB_RZ_SAT_F;
6537+
case Intrinsic::nvvm_add_rz_ftz_f:
6538+
return NVPTXISD::SUB_RZ_FTZ_F;
6539+
case Intrinsic::nvvm_add_rz_ftz_sat_f:
6540+
return NVPTXISD::SUB_RZ_FTZ_SAT_F;
6541+
case Intrinsic::nvvm_add_rm_f:
6542+
return NVPTXISD::SUB_RM_F;
6543+
case Intrinsic::nvvm_add_rm_sat_f:
6544+
return NVPTXISD::SUB_RM_SAT_F;
6545+
case Intrinsic::nvvm_add_rm_ftz_f:
6546+
return NVPTXISD::SUB_RM_FTZ_F;
6547+
case Intrinsic::nvvm_add_rm_ftz_sat_f:
6548+
return NVPTXISD::SUB_RM_FTZ_SAT_F;
6549+
case Intrinsic::nvvm_add_rp_f:
6550+
return NVPTXISD::SUB_RP_F;
6551+
case Intrinsic::nvvm_add_rp_sat_f:
6552+
return NVPTXISD::SUB_RP_SAT_F;
6553+
case Intrinsic::nvvm_add_rp_ftz_f:
6554+
return NVPTXISD::SUB_RP_FTZ_F;
6555+
case Intrinsic::nvvm_add_rp_ftz_sat_f:
6556+
return NVPTXISD::SUB_RP_FTZ_SAT_F;
6557+
}
6558+
llvm_unreachable("Invalid add intrinsic ID");
6559+
return std::nullopt;
6560+
}
6561+
6562+
static std::optional<unsigned> getSubF64Opc(Intrinsic::ID AddIntrinsicID) {
6563+
switch (AddIntrinsicID) {
6564+
default:
6565+
return std::nullopt;
6566+
case Intrinsic::nvvm_add_rn_d:
6567+
return NVPTXISD::SUB_RN_D;
6568+
case Intrinsic::nvvm_add_rz_d:
6569+
return NVPTXISD::SUB_RZ_D;
6570+
case Intrinsic::nvvm_add_rm_d:
6571+
return NVPTXISD::SUB_RM_D;
6572+
case Intrinsic::nvvm_add_rp_d:
6573+
return NVPTXISD::SUB_RP_D;
6574+
}
6575+
llvm_unreachable("Invalid add intrinsic ID");
6576+
return std::nullopt;
6577+
}
6578+
6579+
static SDValue combineF32AddWithNeg(SDNode *N, SelectionDAG &DAG,
6580+
Intrinsic::ID AddIntrinsicID,
6581+
unsigned PTXVersion, unsigned SmVersion) {
6582+
SDValue Op2 = N->getOperand(2);
6583+
6584+
if (Op2.getOpcode() != ISD::FNEG)
6585+
return SDValue();
6586+
6587+
// If PTX > 8.6 and SM >= 100, when Op1 is a fpextend from f16 or bf16, don't
6588+
// fold this pattern as this will be folded to a mixed precision instruction
6589+
// later on.
6590+
SDValue Op1 = N->getOperand(1);
6591+
if (PTXVersion >= 86 && SmVersion >= 100 &&
6592+
Op1.getOpcode() == ISD::FP_EXTEND) {
6593+
if (Op1.getOperand(0).getSimpleValueType() == MVT::f16 ||
6594+
Op1.getOperand(0).getSimpleValueType() == MVT::bf16)
6595+
return SDValue();
6596+
}
6597+
6598+
std::optional<unsigned> Opc = getSubF32Opc(AddIntrinsicID);
6599+
if (!Opc)
6600+
return SDValue();
6601+
6602+
SDLoc DL(N);
6603+
return DAG.getNode(*Opc, DL, N->getValueType(0), N->getOperand(1),
6604+
Op2.getOperand(0));
6605+
}
6606+
6607+
static SDValue combineF64AddWithNeg(SDNode *N, SelectionDAG &DAG,
6608+
Intrinsic::ID AddIntrinsicID) {
6609+
SDValue Op2 = N->getOperand(2);
6610+
6611+
if (Op2.getOpcode() != ISD::FNEG)
6612+
return SDValue();
6613+
6614+
std::optional<unsigned> Opc = getSubF64Opc(AddIntrinsicID);
6615+
if (!Opc)
6616+
return SDValue();
6617+
6618+
SDLoc DL(N);
6619+
return DAG.getNode(*Opc, DL, N->getValueType(0), N->getOperand(1),
6620+
Op2.getOperand(0));
6621+
}
6622+
6623+
static SDValue combineIntrinsicWOChain(SDNode *N,
6624+
TargetLowering::DAGCombinerInfo &DCI,
6625+
const NVPTXSubtarget &STI) {
6626+
unsigned IntID = N->getConstantOperandVal(0);
6627+
6628+
switch (IntID) {
6629+
default:
6630+
break;
6631+
case Intrinsic::nvvm_add_rn_f:
6632+
case Intrinsic::nvvm_add_rn_sat_f:
6633+
case Intrinsic::nvvm_add_rn_ftz_f:
6634+
case Intrinsic::nvvm_add_rn_ftz_sat_f:
6635+
case Intrinsic::nvvm_add_rz_f:
6636+
case Intrinsic::nvvm_add_rz_sat_f:
6637+
case Intrinsic::nvvm_add_rz_ftz_f:
6638+
case Intrinsic::nvvm_add_rz_ftz_sat_f:
6639+
case Intrinsic::nvvm_add_rm_f:
6640+
case Intrinsic::nvvm_add_rm_sat_f:
6641+
case Intrinsic::nvvm_add_rm_ftz_f:
6642+
case Intrinsic::nvvm_add_rm_ftz_sat_f:
6643+
case Intrinsic::nvvm_add_rp_f:
6644+
case Intrinsic::nvvm_add_rp_sat_f:
6645+
case Intrinsic::nvvm_add_rp_ftz_f:
6646+
case Intrinsic::nvvm_add_rp_ftz_sat_f:
6647+
return combineF32AddWithNeg(N, DCI.DAG, IntID, STI.getPTXVersion(),
6648+
STI.getSmVersion());
6649+
case Intrinsic::nvvm_add_rn_d:
6650+
case Intrinsic::nvvm_add_rz_d:
6651+
case Intrinsic::nvvm_add_rm_d:
6652+
case Intrinsic::nvvm_add_rp_d:
6653+
return combineF64AddWithNeg(N, DCI.DAG, IntID);
6654+
}
6655+
return SDValue();
6656+
}
6657+
65076658
static SDValue combineProxyReg(SDNode *N,
65086659
TargetLowering::DAGCombinerInfo &DCI) {
65096660

@@ -6570,6 +6721,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
65706721
return combineSTORE(N, DCI, STI);
65716722
case ISD::VSELECT:
65726723
return PerformVSELECTCombine(N, DCI);
6724+
case ISD::INTRINSIC_WO_CHAIN:
6725+
return combineIntrinsicWOChain(N, DCI, STI);
65736726
}
65746727
return SDValue();
65756728
}

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,27 +1910,25 @@ let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in {
19101910
// Sub
19111911
//
19121912

1913-
def INT_NVVM_SUB_RN_FTZ_F : F_MATH_2<"sub.rn.ftz.f32", B32, B32, B32, int_nvvm_sub_rn_ftz_f>;
1914-
def INT_NVVM_SUB_RN_SAT_FTZ_F : F_MATH_2<"sub.rn.sat.ftz.f32", B32, B32, B32, int_nvvm_sub_rn_ftz_sat_f>;
1915-
def INT_NVVM_SUB_RN_F : F_MATH_2<"sub.rn.f32", B32, B32, B32, int_nvvm_sub_rn_f>;
1916-
def INT_NVVM_SUB_RN_SAT_F : F_MATH_2<"sub.rn.sat.f32", B32, B32, B32, int_nvvm_sub_rn_sat_f>;
1917-
def INT_NVVM_SUB_RZ_FTZ_F : F_MATH_2<"sub.rz.ftz.f32", B32, B32, B32, int_nvvm_sub_rz_ftz_f>;
1918-
def INT_NVVM_SUB_RZ_SAT_FTZ_F : F_MATH_2<"sub.rz.sat.ftz.f32", B32, B32, B32, int_nvvm_sub_rz_ftz_sat_f>;
1919-
def INT_NVVM_SUB_RZ_F : F_MATH_2<"sub.rz.f32", B32, B32, B32, int_nvvm_sub_rz_f>;
1920-
def INT_NVVM_SUB_RZ_SAT_F : F_MATH_2<"sub.rz.sat.f32", B32, B32, B32, int_nvvm_sub_rz_sat_f>;
1921-
def INT_NVVM_SUB_RM_FTZ_F : F_MATH_2<"sub.rm.ftz.f32", B32, B32, B32, int_nvvm_sub_rm_ftz_f>;
1922-
def INT_NVVM_SUB_RM_SAT_FTZ_F : F_MATH_2<"sub.rm.sat.ftz.f32", B32, B32, B32, int_nvvm_sub_rm_ftz_sat_f>;
1923-
def INT_NVVM_SUB_RM_F : F_MATH_2<"sub.rm.f32", B32, B32, B32, int_nvvm_sub_rm_f>;
1924-
def INT_NVVM_SUB_RM_SAT_F : F_MATH_2<"sub.rm.sat.f32", B32, B32, B32, int_nvvm_sub_rm_sat_f>;
1925-
def INT_NVVM_SUB_RP_FTZ_F : F_MATH_2<"sub.rp.ftz.f32", B32, B32, B32, int_nvvm_sub_rp_ftz_f>;
1926-
def INT_NVVM_SUB_RP_SAT_FTZ_F : F_MATH_2<"sub.rp.sat.ftz.f32", B32, B32, B32, int_nvvm_sub_rp_ftz_sat_f>;
1927-
def INT_NVVM_SUB_RP_F : F_MATH_2<"sub.rp.f32", B32, B32, B32, int_nvvm_sub_rp_f>;
1928-
def INT_NVVM_SUB_RP_SAT_F : F_MATH_2<"sub.rp.sat.f32", B32, B32, B32, int_nvvm_sub_rp_sat_f>;
1929-
1930-
def INT_NVVM_SUB_RN_D : F_MATH_2<"sub.rn.f64", B64, B64, B64, int_nvvm_sub_rn_d>;
1931-
def INT_NVVM_SUB_RZ_D : F_MATH_2<"sub.rz.f64", B64, B64, B64, int_nvvm_sub_rz_d>;
1932-
def INT_NVVM_SUB_RM_D : F_MATH_2<"sub.rm.f64", B64, B64, B64, int_nvvm_sub_rm_d>;
1933-
def INT_NVVM_SUB_RP_D : F_MATH_2<"sub.rp.f64", B64, B64, B64, int_nvvm_sub_rp_d>;
1913+
foreach rnd = ["_RN", "_RZ", "_RM", "_RP"] in {
1914+
foreach ftz = ["", "_FTZ"] in {
1915+
foreach sat = ["", "_SAT"] in {
1916+
def SUB_ # rnd # ftz # sat # _F :
1917+
SDNode<"NVPTXISD::SUB" # rnd # ftz # sat # "_F", SDTFPBinOp>;
1918+
def INT_NVVM_SUB # rnd # ftz # sat # _F :
1919+
BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
1920+
!tolower(!subst("_", ".", "sub" # rnd # ftz # sat # "_f32")),
1921+
[(set f32:$dst,
1922+
(!cast<SDNode>("SUB_" # rnd # ftz # sat # "_F") f32:$a, f32:$b))]>;
1923+
}
1924+
}
1925+
1926+
def SUB_ # rnd # _D : SDNode<"NVPTXISD::SUB" # rnd # "_D", SDTFPBinOp>;
1927+
def INT_NVVM_SUB # rnd # _D : BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, B64:$b),
1928+
!tolower(!subst("_", ".", "sub" # rnd # "_f64")),
1929+
[(set f64:$dst,
1930+
(!cast<SDNode>("SUB_" # rnd # "_D") f64:$a, f64:$b))]>;
1931+
}
19341932

19351933
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
19361934
foreach sat = ["", "_sat"] in {
@@ -1939,9 +1937,9 @@ foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
19391937
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B32:$b),
19401938
!subst("_", ".", "sub" # rnd # sat # "_f32_" # type),
19411939
[(set f32:$dst,
1942-
(!cast<Intrinsic>("int_nvvm_sub" # rnd # sat # "_f")
1940+
(!cast<Intrinsic>("int_nvvm_add" # rnd # sat # "_f")
19431941
(f32 (fpextend type:$a)),
1944-
f32:$b))]>,
1942+
(f32 (fneg f32:$b))))]>,
19451943
Requires<[hasSM<100>, hasPTX<86>]>;
19461944
}
19471945
}
@@ -6236,3 +6234,4 @@ foreach sp = [0, 1] in {
62366234
}
62376235
}
62386236
}
6237+

0 commit comments

Comments
 (0)