path: root/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
author    patrick <patrick@openbsd.org>    2018-04-06 14:26:03 +0000
committer patrick <patrick@openbsd.org>    2018-04-06 14:26:03 +0000
commit    bdabc2f19ffb9e20600dad6e8a300842a7bda50e (patch)
tree      c50e7b2e5449b074651bb82a58517a8ebc4a8cf7 /gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
parent    Print a 'p' flag for file descriptors that were opened after pledge(2). (diff)
Import LLVM 6.0.1 release including clang, lld and lldb.
"where is the kaboom?" deraadt@
Diffstat (limited to 'gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r--  gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 616
1 file changed, 534 insertions(+), 82 deletions(-)
diff --git a/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index b3a3c73f6df..f0e8b11a3d9 100644
--- a/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,6 +51,9 @@
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -82,11 +85,8 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -114,6 +114,8 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+
// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;
@@ -140,6 +142,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -226,6 +231,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UREM, MVT::i64, Expand);
}
+ if (Subtarget.hasP9Vector()) {
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ }
+
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
@@ -283,14 +294,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
- // PowerPC does not have BSWAP
+ // PowerPC does not have BSWAP, but we can use the vector BSWAP instruction
+ // xxbrd to speed up scalar BSWAP64.
// CTPOP and CTTZ were introduced in P8 and P9, respectively.
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
- setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
+ setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
}
@@ -773,6 +786,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
}
+
+ if (Subtarget.hasP9Altivec()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ }
}
if (Subtarget.hasQPX()) {
@@ -1131,7 +1149,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
- case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
@@ -1139,6 +1157,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
+ case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
+ case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
@@ -2413,8 +2433,8 @@ static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true,
- false, 0);
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
+ MachineMemOperand::MOLoad);
}
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
@@ -2470,7 +2490,6 @@ SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Default:
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
@@ -2488,7 +2507,6 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
switch (getTargetMachine().getCodeModel()) {
- case CodeModel::Default:
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
@@ -2560,7 +2578,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
- const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
@@ -3529,7 +3547,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
- Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
@@ -3614,6 +3632,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store;
@@ -3648,6 +3667,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
break;
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
if (j) {
@@ -3684,6 +3704,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
@@ -3729,6 +3750,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// since otherwise we never run out of FPRs before running out
// of GPRs.
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::f32) {
@@ -3969,7 +3991,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
- Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
@@ -4251,13 +4273,25 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
static bool isFunctionGlobalAddress(SDValue Callee);
static bool
-resideInSameSection(const Function *Caller, SDValue Callee,
+callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
// If !G, Callee can be an external symbol.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G)
return false;
+ // The medium and large code models are expected to provide a TOC large
+ // enough to satisfy all data addressing needs of a module with a single
+ // TOC. Since each module will be addressed with a single TOC, we only
+ // need to check that caller and callee don't cross dso boundaries.
+ if (CodeModel::Medium == TM.getCodeModel() ||
+ CodeModel::Large == TM.getCodeModel())
+ return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());
+
+ // Otherwise we need to ensure callee and caller are in the same section,
+ // since the linker may allocate multiple TOCs, and we don't know which
+ // sections will belong to the same TOC base.
+
const GlobalValue *GV = G->getGlobal();
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4335,12 +4369,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
}
static bool
-hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
- if (CS->arg_size() != CallerFn->arg_size())
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
+ if (CS.arg_size() != CallerFn->arg_size())
return false;
- ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
- ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+ ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
+ ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
@@ -4363,11 +4397,30 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
return true;
}
+// Returns true if TCO is possible between the caller's and callee's
+// calling conventions.
+static bool
+areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
+ CallingConv::ID CalleeCC) {
+ // Tail calls are possible with fastcc and ccc.
+ auto isTailCallableCC = [] (CallingConv::ID CC){
+ return CC == CallingConv::C || CC == CallingConv::Fast;
+ };
+ if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
+ return false;
+
+ // We can safely tail call both fastcc and ccc callees from a C calling
+ // convention caller. If the caller is fastcc, we may have less stack space
+ // than a non-fastcc caller with the same signature so disable tail-calls in
+ // that case.
+ return CallerCC == CallingConv::C || CallerCC == CalleeCC;
+}
+
bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee,
CallingConv::ID CalleeCC,
- ImmutableCallSite *CS,
+ ImmutableCallSite CS,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -4379,15 +4432,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// Variadic argument functions are not supported.
if (isVarArg) return false;
- MachineFunction &MF = DAG.getMachineFunction();
- CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
-
- // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has
- // the same calling convention
- if (CallerCC != CalleeCC) return false;
-
- // SCO support C calling convention
- if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+ auto &Caller = DAG.getMachineFunction().getFunction();
+ // Check that the calling conventions are compatible for tco.
+ if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
return false;
// A caller containing byval parameters is not supported.
@@ -4397,20 +4444,37 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// A callee containing byval parameters is not supported either.
// Note: This is a quick workaround, because in some cases, e.g.
// caller's stack size > callee's stack size, we are still able to apply
- // sibling call optimization. See: https://reviews.llvm.org/D23441#513574
+ // sibling call optimization. For example, gcc is able to do SCO for caller1
+ // in the following example, but not for caller2.
+ // struct test {
+ // long int a;
+ // char ary[56];
+ // } gTest;
+ // __attribute__((noinline)) int callee(struct test v, struct test *b) {
+ // b->a = v.a;
+ // return 0;
+ // }
+ // void caller1(struct test a, struct test c, struct test *b) {
+ // callee(gTest, b); }
+ // void caller2(struct test *b) { callee(gTest, b); }
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
return false;
+ // If callee and caller use different calling conventions, we cannot pass
+ // parameters on stack since offsets for the parameter area may be different.
+ if (Caller.getCallingConv() != CalleeCC &&
+ needStackSlotPassParameters(Subtarget, Outs))
+ return false;
+
// No TCO/SCO on an indirect call because the caller has to restore its TOC
if (!isFunctionGlobalAddress(Callee) &&
!isa<ExternalSymbolSDNode>(Callee))
return false;
- // Check if Callee resides in the same section, because for now, PPC64 SVR4
- // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
- // section.
+ // If the caller and callee potentially have different TOC bases then we
+ // cannot tail call since we need to restore the TOC pointer after the call.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
- if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine()))
+ if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
@@ -4422,7 +4486,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// If the callee uses the same argument list that the caller is using, then
// we can apply SCO in this case. If not, then we need to check if the
// callee needs stack for passing arguments.
- if (!hasSameArgumentList(MF.getFunction(), CS) &&
+ if (!hasSameArgumentList(&Caller, CS) &&
needStackSlotPassParameters(Subtarget, Outs)) {
return false;
}
@@ -4447,7 +4511,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
return false;
MachineFunction &MF = DAG.getMachineFunction();
- CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+ CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
// Functions containing by val parameters are not supported.
for (unsigned i = 0; i != Ins.size(); i++) {
@@ -4676,7 +4740,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
bool isPatchPoint, bool hasNest,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
- ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
+ ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
@@ -4699,7 +4763,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
// we're building with the leopard linker or later, which automatically
// synthesizes these stubs.
const TargetMachine &TM = DAG.getTarget();
- const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
const GlobalValue *GV = nullptr;
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
GV = G->getGlobal();
@@ -4787,7 +4851,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
- MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
+ MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
/* Alignment = */ 8, MMOFlags);
@@ -4917,7 +4981,7 @@ SDValue PPCTargetLowering::FinishCall(
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
+ SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
@@ -4992,7 +5056,7 @@ SDValue PPCTargetLowering::FinishCall(
// any other variadic arguments).
Ops.insert(std::next(Ops.begin()), AddTOC);
} else if (CallOpc == PPCISD::CALL &&
- !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
+ !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) {
// Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP;
}
@@ -5025,10 +5089,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
- ImmutableCallSite *CS = CLI.CS;
+ ImmutableCallSite CS = CLI.CS;
if (isTailCall) {
- if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall()))
+ if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall =
@@ -5056,7 +5120,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
- if (!isTailCall && CS && CS->isMustTailCall())
+ if (!isTailCall && CS && CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -5090,7 +5154,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
@@ -5324,7 +5388,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
@@ -5974,7 +6038,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite *CS) const {
+ ImmutableCallSite CS) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -7448,9 +7512,11 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
- bool HasDirectMove) {
+ bool HasDirectMove,
+ bool HasP8Vector) {
EVT VecVT = V->getValueType(0);
- bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 ||
+ bool RightType = VecVT == MVT::v2f64 ||
+ (HasP8Vector && VecVT == MVT::v4f32) ||
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
if (!RightType)
return false;
@@ -7612,7 +7678,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
if (Subtarget.hasVSX() &&
- haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove()))
+ haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
+ Subtarget.hasP8Vector()))
return Op;
return SDValue();
}
@@ -7646,6 +7713,15 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return DAG.getBitcast(Op.getValueType(), NewBV);
return NewBV;
}
+
+ // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll
+ // detect that constant splats like v8i16: 0xABAB are really just splats
+ // of a 1-byte constant. In this case, we need to convert the node to a
+ // splat of v16i8 and a bitcast.
+ if (Op.getValueType() != MVT::v16i8)
+ return DAG.getBitcast(Op.getValueType(),
+ DAG.getConstant(SplatBits, dl, MVT::v16i8));
+
return Op;
}
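The splat-narrowing above can be read as the following minimal sketch (hypothetical helper name; in the patch the logic is inline in LowerBUILD_VECTOR): a splat whose constant really fits in one byte is re-emitted on v16i8, where byte splat-immediate patterns (e.g. vspltisb/xxspltib) can match, and bitcast back to the requested type.

    // Sketch under the stated assumptions:
    static SDValue rebuildAsByteSplat(SelectionDAG &DAG, const SDLoc &dl,
                                      EVT VT, const APInt &SplatBits) {
      // getConstant() splats a scalar constant across a vector type.
      SDValue ByteSplat = DAG.getConstant(SplatBits, dl, MVT::v16i8);
      return VT == MVT::v16i8 ? ByteSplat : DAG.getBitcast(VT, ByteSplat);
    }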
@@ -7855,6 +7931,219 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) const {
+ const unsigned BytesInVector = 16;
+ bool IsLE = Subtarget.isLittleEndian();
+ SDLoc dl(N);
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned ShiftElts = 0, InsertAtByte = 0;
+ bool Swap = false;
+
+ // Shifts required to get the byte we want at element 7.
+ unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
+ 0, 15, 14, 13, 12, 11, 10, 9};
+ unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8};
+
+ ArrayRef<int> Mask = N->getMask();
+ int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ // For each mask element, find out if we're just inserting something
+ // from V2 into V1 or vice versa.
+ // Possible permutations inserting an element from V2 into V1:
+ // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ // ...
+ // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+ // Inserting from V1 into V2 will be similar, except mask range will be
+ // [16,31].
+
+ bool FoundCandidate = false;
+ // If both vector operands for the shuffle are the same vector, the mask
+ // will contain only elements from the first one and the second one will be
+ // undef.
+ unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+ // Go through the mask of bytes to find an element that's being moved
+ // from one vector to the other.
+ for (unsigned i = 0; i < BytesInVector; ++i) {
+ unsigned CurrentElement = Mask[i];
+ // If the 2nd operand is undefined, we should only look for the VINSERTB
+ // source element (7 for BE, 8 for LE) in the Mask.
+ if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+ continue;
+
+ bool OtherElementsInOrder = true;
+ // Examine the other elements in the Mask to see if they're in original
+ // order.
+ for (unsigned j = 0; j < BytesInVector; ++j) {
+ if (j == i)
+ continue;
+ // If CurrentElement is from V1 [0,15], then we expect the rest of the
+ // Mask to be from V2 [16,31] and vice versa, unless the 2nd operand is
+ // undefined, in which case we assume we're picking from the 1st operand.
+ int MaskOffset =
+ (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+ if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+ OtherElementsInOrder = false;
+ break;
+ }
+ }
+ // If other elements are in original order, we record the number of shifts
+ // we need to get the element we want into element 7. Also record which byte
+ // in the vector we should insert into.
+ if (OtherElementsInOrder) {
+ // If 2nd operand is undefined, we assume no shifts and no swapping.
+ if (V2.isUndef()) {
+ ShiftElts = 0;
+ Swap = false;
+ } else {
+ // Only need the last 4 bits for the shift amount because the operands
+ // will be swapped if CurrentElement is >= 2^4.
+ ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
+ : BigEndianShifts[CurrentElement & 0xF];
+ Swap = CurrentElement < BytesInVector;
+ }
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
+ FoundCandidate = true;
+ break;
+ }
+ }
+
+ if (!FoundCandidate)
+ return SDValue();
+
+ // Candidate found, construct the proper SDAG sequence with VINSERTB,
+ // optionally with VECSHL if shift is required.
+ if (Swap)
+ std::swap(V1, V2);
+ if (V2.isUndef())
+ V2 = V1;
+ if (ShiftElts) {
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+}
+
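A worked example of the search above, on an assumed little-endian mask (not one taken from the patch's tests):

    // Hypothetical LE shuffle mask:
    //   {0,1,2,3, 31, 5,6,7,8,9,10,11,12,13,14,15}
    // Result byte 4 comes from V2's byte 15 (mask value 31); all other bytes
    // keep V1's original order, so the loop accepts i == 4:
    //   Swap         = (31 < 16)                    -> false
    //   ShiftElts    = LittleEndianShifts[31 & 0xF] = 9
    //   InsertAtByte = 16 - (4 + 1)                 = 11
    // and the lowering emits VECSHL(V2, V2, 9) feeding VECINSERT(V1, ..., 11).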
+/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
+ SelectionDAG &DAG) const {
+ const unsigned NumHalfWords = 8;
+ const unsigned BytesInVector = NumHalfWords * 2;
+ // Check that the shuffle is on half-words.
+ if (!isNByteElemShuffleMask(N, 2, 1))
+ return SDValue();
+
+ bool IsLE = Subtarget.isLittleEndian();
+ SDLoc dl(N);
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned ShiftElts = 0, InsertAtByte = 0;
+ bool Swap = false;
+
+ // Shifts required to get the half-word we want at element 3.
+ unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
+ unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
+
+ uint32_t Mask = 0;
+ uint32_t OriginalOrderLow = 0x1234567;
+ uint32_t OriginalOrderHigh = 0x89ABCDEF;
+ // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
+ // 32-bit space, using one 4-bit nibble per element.
+ for (unsigned i = 0; i < NumHalfWords; ++i) {
+ unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+ Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
+ }
+
+ // For each mask element, find out if we're just inserting something
+ // from V2 into V1 or vice versa. Possible permutations inserting an element
+ // from V2 into V1:
+ // X, 1, 2, 3, 4, 5, 6, 7
+ // 0, X, 2, 3, 4, 5, 6, 7
+ // 0, 1, X, 3, 4, 5, 6, 7
+ // 0, 1, 2, X, 4, 5, 6, 7
+ // 0, 1, 2, 3, X, 5, 6, 7
+ // 0, 1, 2, 3, 4, X, 6, 7
+ // 0, 1, 2, 3, 4, 5, X, 7
+ // 0, 1, 2, 3, 4, 5, 6, X
+ // Inserting from V1 into V2 will be similar, except mask range will be [8,15].
+
+ bool FoundCandidate = false;
+ // Go through the mask of half-words to find an element that's being moved
+ // from one vector to the other.
+ for (unsigned i = 0; i < NumHalfWords; ++i) {
+ unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+ uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
+ uint32_t MaskOtherElts = ~(0xF << MaskShift);
+ uint32_t TargetOrder = 0x0;
+
+ // If both vector operands for the shuffle are the same vector, the mask
+ // will contain only elements from the first one and the second one will be
+ // undef.
+ if (V2.isUndef()) {
+ ShiftElts = 0;
+ unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
+ TargetOrder = OriginalOrderLow;
+ Swap = false;
+ // Skip if this is not the correct element or the mask of the other
+ // elements doesn't match our expected order.
+ if (MaskOneElt == VINSERTHSrcElem &&
+ (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+ FoundCandidate = true;
+ break;
+ }
+ } else { // If both operands are defined.
+ // Target order is [8,15] if the current mask is between [0,7].
+ TargetOrder =
+ (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
+ // Skip if the mask of the other elements doesn't match our expected order.
+ if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+ // We only need the last 3 bits for the number of shifts.
+ ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
+ : BigEndianShifts[MaskOneElt & 0x7];
+ InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+ Swap = MaskOneElt < NumHalfWords;
+ FoundCandidate = true;
+ break;
+ }
+ }
+ }
+
+ if (!FoundCandidate)
+ return SDValue();
+
+ // Candidate found, construct the proper SDAG sequence with VINSERTH,
+ // optionally with VECSHL if shift is required.
+ if (Swap)
+ std::swap(V1, V2);
+ if (V2.isUndef())
+ V2 = V1;
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ if (ShiftElts) {
+ // Double ShiftElts because we're left shifting on v16i8 type.
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+ DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+ }
+ SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+}
+
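The nibble packing above is easiest to see on the identity masks (a worked example, with values derived from the loop itself):

    // For the half-word mask {0,1,2,3,4,5,6,7} the loop packs nibble
    // N->getMaskElt(2*i)/2 into bits [31-4*i .. 28-4*i], giving
    //   Mask == 0x01234567 == OriginalOrderLow  (written 0x1234567 above);
    // the all-from-V2 mask {8,9,10,11,12,13,14,15} packs to
    //   Mask == 0x89ABCDEF == OriginalOrderHigh.
    // A single nibble out of place relative to either constant is exactly the
    // one-element insertion VINSERTH can perform; MaskOtherElts removes that
    // nibble before the remainder is compared against TargetOrder.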
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
@@ -7869,7 +8158,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ShiftElts, InsertAtByte;
- bool Swap;
+ bool Swap = false;
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
@@ -7880,15 +8169,23 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
- SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
- SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+ SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+ if (Subtarget.hasP9Altivec()) {
+ SDValue NewISDNode;
+ if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
+ return NewISDNode;
+
+ if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
+ return NewISDNode;
+ }
if (Subtarget.hasVSX() &&
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
@@ -8390,6 +8687,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntrinsicID =
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
@@ -8397,9 +8696,37 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getRegister(PPC::R2, MVT::i32);
}
+ // We are looking for absolute values here.
+ // The idea is to try to fit one of two patterns:
+ // max (a, (0-a)) OR max ((0-a), a)
+ if (Subtarget.hasP9Vector() &&
+ (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
+ IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
+ IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
+ SDValue V1 = Op.getOperand(1);
+ SDValue V2 = Op.getOperand(2);
+ if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
+ (V1.getSimpleValueType() == MVT::v4i32 ||
+ V1.getSimpleValueType() == MVT::v8i16 ||
+ V1.getSimpleValueType() == MVT::v16i8)) {
+ if (V1.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
+ V1.getOperand(1) == V2) {
+ // Generate the abs instruction with the operands.
+ return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
+ }
+
+ if (V2.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
+ V2.getOperand(1) == V1) {
+ // Generate the abs instruction with the operands.
+ return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+ }
+ }
+ }
+
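At the source level the recognized pattern looks roughly like the hypothetical AltiVec snippet below; the vec_max/vec_sub pair collapses to a single ISD::ABS node, which the setOperationAction calls earlier in this patch mark Legal on P9.

    #include <altivec.h>

    // Hypothetical example; vmaxsw is reached via vec_max on signed ints.
    vector signed int abs_v4i32(vector signed int a) {
      vector signed int zero = vec_splats(0);
      return vec_max(a, vec_sub(zero, a));  // folded to ISD::ABS on P9
    }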
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
- SDLoc dl(Op);
int CompareOpc;
bool isDot;
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
@@ -8495,6 +8822,59 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// Lower scalar BSWAP64 to xxbrd.
+SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ // MTVSRDD
+ Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
+ Op.getOperand(0));
+ // XXBRD
+ Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
+ // MFVSRD
+ int VectorIndex = 0;
+ if (Subtarget.isLittleEndian())
+ VectorIndex = 1;
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
+ DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
+ return Op;
+}
+
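A hedged illustration of the effect: with this hook, a plain 64-bit byte swap on an ISA 3.0 CPU should select to the three-instruction VSX sequence named in the comments above, rather than a shift/rotate expansion.

    #include <cstdint>

    uint64_t swap64(uint64_t x) {
      // Expected (approximate) -mcpu=power9 codegen:
      //   mtvsrdd vsX, r3, r3   ; BUILD_VECTOR v2i64 of the scalar
      //   xxbrd   vsX, vsX      ; PPCISD::XXREVERSE
      //   mfvsrd  r3, vsX       ; EXTRACT_VECTOR_ELT (element 1 on LE)
      return __builtin_bswap64(x);
    }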
+// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
+// compared to a value that is atomically loaded (atomic loads zero-extend).
+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
+ "Expecting an atomic compare-and-swap here.");
+ SDLoc dl(Op);
+ auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = AtomicNode->getMemoryVT();
+ if (MemVT.getSizeInBits() >= 32)
+ return Op;
+
+ SDValue CmpOp = Op.getOperand(2);
+ // If this is already correctly zero-extended, leave it alone.
+ auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
+ if (DAG.MaskedValueIsZero(CmpOp, HighBits))
+ return Op;
+
+ // Clear the high bits of the compare operand.
+ unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
+ SDValue NewCmpOp =
+ DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
+ DAG.getConstant(MaskVal, dl, MVT::i32));
+
+ // Replace the existing compare operand with the properly zero-extended one.
+ SmallVector<SDValue, 4> Ops;
+ for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
+ Ops.push_back(AtomicNode->getOperand(i));
+ Ops[2] = NewCmpOp;
+ MachineMemOperand *MMO = AtomicNode->getMemOperand();
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
+ auto NodeTy =
+ (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
+ return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
+}
+
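A minimal C++ illustration of the failure mode being guarded against (hypothetical example): the i8 compare value reaches the DAG in a 32-bit register whose high bits may be sign-extension bits, while the lbarx-style atomic load zero-extends, so without the AND mask an expected value of -1 would compare 0xFFFFFFFF against 0x000000FF and spuriously fail.

    #include <atomic>
    #include <cstdint>

    bool cas_byte(std::atomic<int8_t> &v, int8_t expected, int8_t desired) {
      // Relies on the zero-extension inserted by LowerATOMIC_CMP_SWAP above.
      return v.compare_exchange_strong(expected, desired);
    }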
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8539,11 +8919,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
- if (C)
- return Op;
- return SDValue();
+ if (!C)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
+ unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
+ unsigned InsertAtElement = C->getZExtValue();
+ unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
+ if (Subtarget.isLittleEndian()) {
+ InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
+ }
+ return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
+ DAG.getConstant(InsertAtByte, dl, MVT::i32));
+ }
+ return Op;
}
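The little-endian byte remapping above can be factored as a small sketch (hypothetical helper), with one worked value:

    // Sketch: element index -> byte operand of the BE-numbered vins* insn.
    static unsigned vecInsertByte(unsigned Elem, unsigned ElemBytes,
                                  bool IsLittleEndian) {
      unsigned Byte = Elem * ElemBytes;
      return IsLittleEndian ? (16 - ElemBytes) - Byte : Byte;
    }
    // e.g. inserting element 3 of a v8i16 on LE: (16 - 2) - 6 == 8.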
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -8966,6 +9364,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SREM:
case ISD::UREM:
return LowerREM(Op, DAG);
+ case ISD::BSWAP:
+ return LowerBSWAP(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP:
+ return LowerATOMIC_CMP_SWAP(Op, DAG);
}
}
@@ -9461,7 +9863,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// Naked functions never have a base pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned BaseReg;
- if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+ if (MF->getFunction().hasFnAttribute(Attribute::Naked))
BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
else
BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
@@ -11546,6 +11948,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
SDLoc dl(N);
SDValue Op(N, 0);
+ // Don't handle ppc_fp128 here or i1 conversions.
+ if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+ return SDValue();
+ if (Op.getOperand(0).getValueType() == MVT::i1)
+ return SDValue();
+
SDValue FirstOperand(Op.getOperand(0));
bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
(FirstOperand.getValueType() == MVT::i8 ||
@@ -11574,11 +11982,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
}
- // Don't handle ppc_fp128 here or i1 conversions.
- if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
- return SDValue();
- if (Op.getOperand(0).getValueType() == MVT::i1)
- return SDValue();
// For i32 intermediate values, unfortunately, the conversion functions
// leave the upper 32 bits of the value undefined. Within the set of
@@ -11887,9 +12290,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
cast<StoreSDNode>(N)->getMemOperand());
}
+ // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
+ // This increases the chance of CSE for constant materialization.
+ EVT VT = N->getOperand(1).getValueType();
+ if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
+ isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) {
+ // Need to sign-extend to 64 bits to handle negative values.
+ EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
+ uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
+ MemVT.getSizeInBits());
+ SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
+
+ // DAG.getTruncStore() can't be used here because it doesn't accept
+ // the general (base + offset) addressing mode.
+ // So we use UpdateNodeOperands and setTruncatingStore instead.
+ DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
+ N->getOperand(3));
+ cast<StoreSDNode>(N)->setTruncatingStore(true);
+ return SDValue(N, 0);
+ }
+
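A hedged illustration of the CSE win: after this combine, a 32-bit and a 64-bit store of the same constant can share one materialized register, because both now reference the same Constant:i64 node.

    // Hypothetical example:
    void f(int *a, long long *b) {
      *a = -1;  // becomes a truncating store of Constant:i64<-1>
      *b = -1;  // reuses the same i64 constant after CSE
    }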
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
- EVT VT = N->getOperand(1).getValueType();
if (VT.isSimple()) {
MVT StoreVT = VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
@@ -12690,6 +13112,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
+ break;
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
}
@@ -12810,7 +13233,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+ unsigned AS, Instruction *I) const {
// PPC does not allow r+i addressing modes for vectors!
if (Ty->isVectorTy() && AM.BaseOffs != 0)
return false;
@@ -12895,7 +13318,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
// Naked functions never have a frame pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned FrameReg;
- if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ if (MF.getFunction().hasFnAttribute(Attribute::Naked))
FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
else
FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
@@ -12940,6 +13363,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
+ MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
@@ -12992,9 +13416,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = 1;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
+ Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvlfda:
@@ -13028,9 +13450,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = 1;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
+ Info.flags = MachineMemOperand::MOLoad;
return true;
}
case Intrinsic::ppc_qpx_qvstfd:
@@ -13082,9 +13502,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = 1;
- Info.vol = false;
- Info.readMem = false;
- Info.writeMem = true;
+ Info.flags = MachineMemOperand::MOStore;
return true;
}
case Intrinsic::ppc_qpx_qvstfda:
@@ -13117,9 +13535,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = 1;
- Info.vol = false;
- Info.readMem = false;
- Info.writeMem = true;
+ Info.flags = MachineMemOperand::MOStore;
return true;
}
default:
@@ -13146,12 +13562,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
bool MemcpyStrSrc,
MachineFunction &MF) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
- const Function *F = MF.getFunction();
+ const Function &F = MF.getFunction();
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
(!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
- !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
@@ -13216,8 +13632,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return TargetLowering::isZExtFree(Val, VT2);
}
-bool PPCTargetLowering::isFPExtFree(EVT VT) const {
- assert(VT.isFloatingPoint());
+bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
+ assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
+ "invalid fpext types");
return true;
}
@@ -13369,7 +13786,7 @@ void PPCTargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
@@ -13467,3 +13884,38 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
return SDValue();
}
+
+bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+ // Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
+ if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
+ return false;
+
+ // If not a tail call then no need to proceed.
+ if (!CI->isTailCall())
+ return false;
+
+ // If tail calls are disabled for the caller then we are done.
+ const Function *Caller = CI->getParent()->getParent();
+ auto Attr = Caller->getFnAttribute("disable-tail-calls");
+ if (Attr.getValueAsString() == "true")
+ return false;
+
+ // If sibling calls have been disabled and tail-calls aren't guaranteed
+ // there is no reason to duplicate.
+ auto &TM = getTargetMachine();
+ if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
+ return false;
+
+ // Can't tail call a function called indirectly, or if it has variadic args.
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || Callee->isVarArg())
+ return false;
+
+ // Make sure the callee and caller calling conventions are eligible for tco.
+ if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
+ CI->getCallingConv()))
+ return false;
+
+ // If the function is local then we have a good chance at tail-calling it.
+ return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+}
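For context (hedged): this hook is queried during IR-level return duplication, so a call like the hypothetical one below can be turned into a sibling call on 64-bit SVR4 once the checks above pass.

    // Hypothetical example; callee is DSO-local, fixed-arity, ccc.
    int callee(int);
    int caller(int x) {
      return callee(x + 1);  // eligible for duplication -> tail call
    }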