| author | 2018-04-06 14:26:03 +0000 |
|---|---|
| committer | 2018-04-06 14:26:03 +0000 |
| commit | bdabc2f19ffb9e20600dad6e8a300842a7bda50e |
| tree | c50e7b2e5449b074651bb82a58517a8ebc4a8cf7 /gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp |
| parent | Print a 'p' flag for file descriptors that were opened after pledge(2). |
Import LLVM 6.0.1 release including clang, lld and lldb.
"where is the kaboom?" deraadt@
Diffstat (limited to 'gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
| -rw-r--r-- | gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 616 |
|---|---|---|

1 file changed, 534 insertions, 82 deletions
diff --git a/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index b3a3c73f6df..f0e8b11a3d9 100644 --- a/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/gnu/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -51,6 +51,9 @@ #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" @@ -82,11 +85,8 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -114,6 +114,8 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden); STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); +static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); + // FIXME: Remove this once the bug has been fixed! extern cl::opt<bool> ANDIGlueBug; @@ -140,6 +142,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended. + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); @@ -226,6 +231,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Expand); } + if (Subtarget.hasP9Vector()) { + setOperationAction(ISD::ABS, MVT::v4i32, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + } + // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -283,14 +294,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Legal); } - // PowerPC does not have BSWAP + // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd + // to speed up scalar BSWAP64. 
// CTPOP or CTTZ were introduced in P8/P9 respectivelly setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); if (Subtarget.isISA3_0()) { + setOperationAction(ISD::BSWAP, MVT::i64 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); } else { + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Expand); } @@ -773,6 +786,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRL, MVT::v1i128, Legal); setOperationAction(ISD::SRA, MVT::v1i128, Expand); } + + if (Subtarget.hasP9Altivec()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + } } if (Subtarget.hasQPX()) { @@ -1131,7 +1149,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; @@ -1139,6 +1157,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Hi: return "PPCISD::Hi"; case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; + case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; + case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; @@ -2413,8 +2433,8 @@ static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, - false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, + MachineMemOperand::MOLoad); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2470,7 +2490,6 @@ SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBase(Table, DAG); @@ -2488,7 +2507,6 @@ PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { - case CodeModel::Default: case CodeModel::Small: case CodeModel::Medium: return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); @@ -2560,7 +2578,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); bool is64bit = Subtarget.isPPC64(); - const Module *M = DAG.getMachineFunction().getFunction()->getParent(); + const Module *M = DAG.getMachineFunction().getFunction().getParent(); PICLevel::Level picLevel = M->getPICLevel(); TLSModel::Model Model = getTargetMachine().getTLSModel(GV); @@ -3529,7 +3547,7 @@ SDValue 
PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; unsigned &QFPR_idx = FPR_idx; SmallVector<SDValue, 8> MemOps; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -3614,6 +3632,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Store; @@ -3648,6 +3667,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( break; unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); SDValue Addr = FIN; if (j) { @@ -3684,6 +3704,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // types to avoid forcing arguments to memory unnecessarily. if (GPR_idx != Num_GPR_Regs) { unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) @@ -3729,6 +3750,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( // since otherwise we never run out of FPRs before running out // of GPRs. unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); + FuncInfo->addLiveInAttr(VReg, Flags); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); if (ObjectVT == MVT::f32) { @@ -3969,7 +3991,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector<SDValue, 8> MemOps; unsigned nAltivecParamsAtEnd = 0; - Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; @@ -4251,13 +4273,25 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, static bool isFunctionGlobalAddress(SDValue Callee); static bool -resideInSameSection(const Function *Caller, SDValue Callee, +callsShareTOCBase(const Function *Caller, SDValue Callee, const TargetMachine &TM) { // If !G, Callee can be an external symbol. GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); if (!G) return false; + // The medium and large code models are expected to provide a sufficiently + // large TOC to provide all data addressing needs of a module with a + // single TOC. Since each module will be addressed with a single TOC then we + // only need to check that caller and callee don't cross dso boundaries. + if (CodeModel::Medium == TM.getCodeModel() || + CodeModel::Large == TM.getCodeModel()) + return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); + + // Otherwise we need to ensure callee and caller are in the same section, + // since the linker may allocate multiple TOCs, and we don't know which + // sections will belong to the same TOC base. 
+ const GlobalValue *GV = G->getGlobal(); if (!GV->isStrongDefinitionForLinker()) return false; @@ -4335,12 +4369,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, } static bool -hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { - if (CS->arg_size() != CallerFn->arg_size()) +hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { + if (CS.arg_size() != CallerFn->arg_size()) return false; - ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); - ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end(); + ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); + ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { @@ -4363,11 +4397,30 @@ hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { return true; } +// Returns true if TCO is possible between the callers and callees +// calling conventions. +static bool +areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC, + CallingConv::ID CalleeCC) { + // Tail calls are possible with fastcc and ccc. + auto isTailCallableCC = [] (CallingConv::ID CC){ + return CC == CallingConv::C || CC == CallingConv::Fast; + }; + if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC)) + return false; + + // We can safely tail call both fastcc and ccc callees from a c calling + // convention caller. If the caller is fastcc, we may have less stack space + // than a non-fastcc caller with the same signature so disable tail-calls in + // that case. + return CallerCC == CallingConv::C || CallerCC == CalleeCC; +} + bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( SDValue Callee, CallingConv::ID CalleeCC, - ImmutableCallSite *CS, + ImmutableCallSite CS, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -4379,15 +4432,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // Variadic argument functions are not supported. if (isVarArg) return false; - MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); - - // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has - // the same calling convention - if (CallerCC != CalleeCC) return false; - - // SCO support C calling convention - if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) + auto &Caller = DAG.getMachineFunction().getFunction(); + // Check that the calling conventions are compatible for tco. + if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC)) return false; // Caller contains any byval parameter is not supported. @@ -4397,20 +4444,37 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // Callee contains any byval parameter is not supported, too. // Note: This is a quick work around, because in some cases, e.g. // caller's stack size > callee's stack size, we are still able to apply - // sibling call optimization. See: https://reviews.llvm.org/D23441#513574 + // sibling call optimization. For example, gcc is able to do SCO for caller1 + // in the following example, but not for caller2. 
+ // struct test { + // long int a; + // char ary[56]; + // } gTest; + // __attribute__((noinline)) int callee(struct test v, struct test *b) { + // b->a = v.a; + // return 0; + // } + // void caller1(struct test a, struct test c, struct test *b) { + // callee(gTest, b); } + // void caller2(struct test *b) { callee(gTest, b); } if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) return false; + // If callee and caller use different calling conventions, we cannot pass + // parameters on stack since offsets for the parameter area may be different. + if (Caller.getCallingConv() != CalleeCC && + needStackSlotPassParameters(Subtarget, Outs)) + return false; + // No TCO/SCO on indirect call because Caller have to restore its TOC if (!isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) return false; - // Check if Callee resides in the same section, because for now, PPC64 SVR4 - // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another - // section. + // If the caller and callee potentially have different TOC bases then we + // cannot tail call since we need to restore the TOC pointer after the call. // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 - if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine())) + if (!callsShareTOCBase(&Caller, Callee, getTargetMachine())) return false; // TCO allows altering callee ABI, so we don't have to check further. @@ -4422,7 +4486,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( // If callee use the same argument list that caller is using, then we can // apply SCO on this case. If it is not, then we need to check if callee needs // stack for passing arguments. - if (!hasSameArgumentList(MF.getFunction(), CS) && + if (!hasSameArgumentList(&Caller, CS) && needStackSlotPassParameters(Subtarget, Outs)) { return false; } @@ -4447,7 +4511,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; MachineFunction &MF = DAG.getMachineFunction(); - CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); + CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { // Functions containing by val parameters are not supported. for (unsigned i = 0; i != Ins.size(); i++) { @@ -4676,7 +4740,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, bool isPatchPoint, bool hasNest, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, - ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { + ImmutableCallSite CS, const PPCSubtarget &Subtarget) { bool isPPC64 = Subtarget.isPPC64(); bool isSVR4ABI = Subtarget.isSVR4ABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); @@ -4699,7 +4763,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, // we're building with the leopard linker or later, which automatically // synthesizes these stubs. const TargetMachine &TM = DAG.getTarget(); - const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); const GlobalValue *GV = nullptr; if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) GV = G->getGlobal(); @@ -4787,7 +4851,7 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, MachineMemOperand::MOInvariant) : MachineMemOperand::MONone; - MachinePointerInfo MPI(CS ? 
CS->getCalledValue() : nullptr); + MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, /* Alignment = */ 8, MMOFlags); @@ -4917,7 +4981,7 @@ SDValue PPCTargetLowering::FinishCall( SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, - SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { + SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const { std::vector<EVT> NodeTys; SmallVector<SDValue, 8> Ops; unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, @@ -4992,7 +5056,7 @@ SDValue PPCTargetLowering::FinishCall( // any other variadic arguments). Ops.insert(std::next(Ops.begin()), AddTOC); } else if (CallOpc == PPCISD::CALL && - !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) { + !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { // Otherwise insert NOP for non-local calls. CallOpc = PPCISD::CALL_NOP; } @@ -5025,10 +5089,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; bool isPatchPoint = CLI.IsPatchPoint; - ImmutableCallSite *CS = CLI.CS; + ImmutableCallSite CS = CLI.CS; if (isTailCall) { - if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall())) + if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) isTailCall = false; else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) isTailCall = @@ -5056,7 +5120,7 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (!isTailCall && CS && CS->isMustTailCall()) + if (!isTailCall && CS && CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -5090,7 +5154,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description // of the 32-bit SVR4 ABI stack frame layout. @@ -5324,7 +5388,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { bool isELFv2ABI = Subtarget.isELFv2ABI(); bool isLittleEndian = Subtarget.isLittleEndian(); unsigned NumOps = Outs.size(); @@ -5974,7 +6038,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - ImmutableCallSite *CS) const { + ImmutableCallSite CS) const { unsigned NumOps = Outs.size(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -7448,9 +7512,11 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, /// - The node is a "load-and-splat" /// In all other cases, we will choose to keep the BUILD_VECTOR. 
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, - bool HasDirectMove) { + bool HasDirectMove, + bool HasP8Vector) { EVT VecVT = V->getValueType(0); - bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 || + bool RightType = VecVT == MVT::v2f64 || + (HasP8Vector && VecVT == MVT::v4f32) || (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); if (!RightType) return false; @@ -7612,7 +7678,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. if (Subtarget.hasVSX() && - haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove())) + haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), + Subtarget.hasP8Vector())) return Op; return SDValue(); } @@ -7646,6 +7713,15 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getBitcast(Op.getValueType(), NewBV); return NewBV; } + + // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll + // detect that constant splats like v8i16: 0xABAB are really just splats + // of a 1-byte constant. In this case, we need to convert the node to a + // splat of v16i8 and a bitcast. + if (Op.getValueType() != MVT::v16i8) + return DAG.getBitcast(Op.getValueType(), + DAG.getConstant(SplatBits, dl, MVT::v16i8)); + return Op; } @@ -7855,6 +7931,219 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, return DAG.getNode(ISD::BITCAST, dl, VT, T); } +/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled +/// by the VINSERTB instruction introduced in ISA 3.0, else just return default +/// SDValue. +SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N, + SelectionDAG &DAG) const { + const unsigned BytesInVector = 16; + bool IsLE = Subtarget.isLittleEndian(); + SDLoc dl(N); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + unsigned ShiftElts = 0, InsertAtByte = 0; + bool Swap = false; + + // Shifts required to get the byte we want at element 7. + unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1, + 0, 15, 14, 13, 12, 11, 10, 9}; + unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0, + 1, 2, 3, 4, 5, 6, 7, 8}; + + ArrayRef<int> Mask = N->getMask(); + int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + // For each mask element, find out if we're just inserting something + // from V2 into V1 or vice versa. + // Possible permutations inserting an element from V2 into V1: + // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // ... + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X + // Inserting from V1 into V2 will be similar, except mask range will be + // [16,31]. + + bool FoundCandidate = false; + // If both vector operands for the shuffle are the same vector, the mask + // will contain only elements from the first one and the second one will be + // undef. + unsigned VINSERTBSrcElem = IsLE ? 8 : 7; + // Go through the mask of half-words to find an element that's being moved + // from one vector to the other. + for (unsigned i = 0; i < BytesInVector; ++i) { + unsigned CurrentElement = Mask[i]; + // If 2nd operand is undefined, we should only look for element 7 in the + // Mask. + if (V2.isUndef() && CurrentElement != VINSERTBSrcElem) + continue; + + bool OtherElementsInOrder = true; + // Examine the other elements in the Mask to see if they're in original + // order. 
+ for (unsigned j = 0; j < BytesInVector; ++j) { + if (j == i) + continue; + // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be + // from V2 [16,31] and vice versa. Unless the 2nd operand is undefined, + // in which we always assume we're always picking from the 1st operand. + int MaskOffset = + (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0; + if (Mask[j] != OriginalOrder[j] + MaskOffset) { + OtherElementsInOrder = false; + break; + } + } + // If other elements are in original order, we record the number of shifts + // we need to get the element we want into element 7. Also record which byte + // in the vector we should insert into. + if (OtherElementsInOrder) { + // If 2nd operand is undefined, we assume no shifts and no swapping. + if (V2.isUndef()) { + ShiftElts = 0; + Swap = false; + } else { + // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4. + ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF] + : BigEndianShifts[CurrentElement & 0xF]; + Swap = CurrentElement < BytesInVector; + } + InsertAtByte = IsLE ? BytesInVector - (i + 1) : i; + FoundCandidate = true; + break; + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTB, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + if (ShiftElts) { + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); +} + +/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled +/// by the VINSERTH instruction introduced in ISA 3.0, else just return default +/// SDValue. +SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, + SelectionDAG &DAG) const { + const unsigned NumHalfWords = 8; + const unsigned BytesInVector = NumHalfWords * 2; + // Check that the shuffle is on half-words. + if (!isNByteElemShuffleMask(N, 2, 1)) + return SDValue(); + + bool IsLE = Subtarget.isLittleEndian(); + SDLoc dl(N); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + unsigned ShiftElts = 0, InsertAtByte = 0; + bool Swap = false; + + // Shifts required to get the half-word we want at element 3. + unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; + unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; + + uint32_t Mask = 0; + uint32_t OriginalOrderLow = 0x1234567; + uint32_t OriginalOrderHigh = 0x89ABCDEF; + // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a + // 32-bit space, only need 4-bit nibbles per element. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); + } + + // For each mask element, find out if we're just inserting something + // from V2 into V1 or vice versa. Possible permutations inserting an element + // from V2 into V1: + // X, 1, 2, 3, 4, 5, 6, 7 + // 0, X, 2, 3, 4, 5, 6, 7 + // 0, 1, X, 3, 4, 5, 6, 7 + // 0, 1, 2, X, 4, 5, 6, 7 + // 0, 1, 2, 3, X, 5, 6, 7 + // 0, 1, 2, 3, 4, X, 6, 7 + // 0, 1, 2, 3, 4, 5, X, 7 + // 0, 1, 2, 3, 4, 5, 6, X + // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. 
+ + bool FoundCandidate = false; + // Go through the mask of half-words to find an element that's being moved + // from one vector to the other. + for (unsigned i = 0; i < NumHalfWords; ++i) { + unsigned MaskShift = (NumHalfWords - 1 - i) * 4; + uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; + uint32_t MaskOtherElts = ~(0xF << MaskShift); + uint32_t TargetOrder = 0x0; + + // If both vector operands for the shuffle are the same vector, the mask + // will contain only elements from the first one and the second one will be + // undef. + if (V2.isUndef()) { + ShiftElts = 0; + unsigned VINSERTHSrcElem = IsLE ? 4 : 3; + TargetOrder = OriginalOrderLow; + Swap = false; + // Skip if not the correct element or mask of other elements don't equal + // to our expected order. + if (MaskOneElt == VINSERTHSrcElem && + (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + FoundCandidate = true; + break; + } + } else { // If both operands are defined. + // Target order is [8,15] if the current mask is between [0,7]. + TargetOrder = + (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow; + // Skip if mask of other elements don't equal our expected order. + if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { + // We only need the last 3 bits for the number of shifts. + ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] + : BigEndianShifts[MaskOneElt & 0x7]; + InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; + Swap = MaskOneElt < NumHalfWords; + FoundCandidate = true; + break; + } + } + } + + if (!FoundCandidate) + return SDValue(); + + // Candidate found, construct the proper SDAG sequence with VINSERTH, + // optionally with VECSHL if shift is required. + if (Swap) + std::swap(V1, V2); + if (V2.isUndef()) + V2 = V1; + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + if (ShiftElts) { + // Double ShiftElts because we're left shifting on v16i8 type. + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, + DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); + } + SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); +} + /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this /// is a shuffle we can handle in a single instruction, return it. Otherwise, /// return the code it can be lowered into. 
Worst case, it can always be @@ -7869,7 +8158,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, bool isLittleEndian = Subtarget.isLittleEndian(); unsigned ShiftElts, InsertAtByte; - bool Swap; + bool Swap = false; if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -7880,15 +8169,23 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (ShiftElts) { SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, DAG.getConstant(ShiftElts, dl, MVT::i32)); - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } - SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, + SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, DAG.getConstant(InsertAtByte, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + if (Subtarget.hasP9Altivec()) { + SDValue NewISDNode; + if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) + return NewISDNode; + + if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) + return NewISDNode; + } if (Subtarget.hasVSX() && PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { @@ -8390,6 +8687,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + if (IntrinsicID == Intrinsic::thread_pointer) { // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) @@ -8397,9 +8696,37 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::R2, MVT::i32); } + // We are looking for absolute values here. + // The idea is to try to fit one of two patterns: + // max (a, (0-a)) OR max ((0-a), a) + if (Subtarget.hasP9Vector() && + (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || + IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = Op.getOperand(1); + SDValue V2 = Op.getOperand(2); + if (V1.getSimpleValueType() == V2.getSimpleValueType() && + (V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8)) { + if ( V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); + } + + if ( V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1 ) { + // Generate the abs instruction with the operands + return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); + } + } + } + // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. - SDLoc dl(Op); int CompareOpc; bool isDot; if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) @@ -8495,6 +8822,59 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { return Op; } +// Lower scalar BSWAP64 to xxbrd. 
+SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + // MTVSRDD + Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), + Op.getOperand(0)); + // XXBRD + Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); + // MFVSRD + int VectorIndex = 0; + if (Subtarget.isLittleEndian()) + VectorIndex = 1; + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, + DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); + return Op; +} + +// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be +// compared to a value that is atomically loaded (atomic loads zero-extend). +SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP && + "Expecting an atomic compare-and-swap here."); + SDLoc dl(Op); + auto *AtomicNode = cast<AtomicSDNode>(Op.getNode()); + EVT MemVT = AtomicNode->getMemoryVT(); + if (MemVT.getSizeInBits() >= 32) + return Op; + + SDValue CmpOp = Op.getOperand(2); + // If this is already correctly zero-extended, leave it alone. + auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits()); + if (DAG.MaskedValueIsZero(CmpOp, HighBits)) + return Op; + + // Clear the high bits of the compare operand. + unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1; + SDValue NewCmpOp = + DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp, + DAG.getConstant(MaskVal, dl, MVT::i32)); + + // Replace the existing compare operand with the properly zero-extended one. + SmallVector<SDValue, 4> Ops; + for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++) + Ops.push_back(AtomicNode->getOperand(i)); + Ops[2] = NewCmpOp; + MachineMemOperand *MMO = AtomicNode->getMemOperand(); + SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other); + auto NodeTy = + (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16; + return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO); +} + SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8539,11 +8919,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Should only be called for ISD::INSERT_VECTOR_ELT"); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2)); // We have legal lowering for constant indices but not for variable ones. - if (C) - return Op; - return SDValue(); + if (!C) + return SDValue(); + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. 
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) { + SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); + unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; + unsigned InsertAtElement = C->getZExtValue(); + unsigned InsertAtByte = InsertAtElement * BytesInEachElement; + if (Subtarget.isLittleEndian()) { + InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; + } + return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, + DAG.getConstant(InsertAtByte, dl, MVT::i32)); + } + return Op; } SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -8966,6 +9364,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SREM: case ISD::UREM: return LowerREM(Op, DAG); + case ISD::BSWAP: + return LowerBSWAP(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: + return LowerATOMIC_CMP_SWAP(Op, DAG); } } @@ -9461,7 +9863,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Naked functions never have a base pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned BaseReg; - if (MF->getFunction()->hasFnAttribute(Attribute::Naked)) + if (MF->getFunction().hasFnAttribute(Attribute::Naked)) BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; else BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; @@ -11546,6 +11948,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDLoc dl(N); SDValue Op(N, 0); + // Don't handle ppc_fp128 here or i1 conversions. + if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) + return SDValue(); + if (Op.getOperand(0).getValueType() == MVT::i1) + return SDValue(); + SDValue FirstOperand(Op.getOperand(0)); bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && (FirstOperand.getValueType() == MVT::i8 || @@ -11574,11 +11982,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); } - // Don't handle ppc_fp128 here or i1 conversions. - if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) - return SDValue(); - if (Op.getOperand(0).getValueType() == MVT::i1) - return SDValue(); // For i32 intermediate values, unfortunately, the conversion functions // leave the upper 32 bits of the value are undefined. Within the set of @@ -11887,9 +12290,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, cast<StoreSDNode>(N)->getMemOperand()); } + // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0> + // So it can increase the chance of CSE constant construction. + EVT VT = N->getOperand(1).getValueType(); + if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() && + isa<ConstantSDNode>(N->getOperand(1)) && VT == MVT::i32) { + // Need to sign-extended to 64-bits to handle negative values. + EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT(); + uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1), + MemVT.getSizeInBits()); + SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64); + + // DAG.getTruncStore() can't be used here because it doesn't accept + // the general (base + offset) addressing mode. + // So we use UpdateNodeOperands and setTruncatingStore instead. + DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2), + N->getOperand(3)); + cast<StoreSDNode>(N)->setTruncatingStore(true); + return SDValue(N, 0); + } + // For little endian, VSX stores require generating xxswapd/lxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 
- EVT VT = N->getOperand(1).getValueType(); if (VT.isSimple()) { MVT StoreVT = VT.getSimpleVT(); if (Subtarget.needsSwapsForVSXMemOps() && @@ -12690,6 +13112,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); + break; case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } @@ -12810,7 +13233,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const { + unsigned AS, Instruction *I) const { // PPC does not allow r+i addressing modes for vectors! if (Ty->isVectorTy() && AM.BaseOffs != 0) return false; @@ -12895,7 +13318,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // Naked functions never have a frame pointer, and so we use r1. For all // other functions, this decision must be delayed until during PEI. unsigned FrameReg; - if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) + if (MF.getFunction().hasFnAttribute(Attribute::Naked)) FrameReg = isPPC64 ? PPC::X1 : PPC::R1; else FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; @@ -12940,6 +13363,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::ppc_qpx_qvlfd: @@ -12992,9 +13416,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvlfda: @@ -13028,9 +13450,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::ppc_qpx_qvstfd: @@ -13082,9 +13502,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = -VT.getStoreSize()+1; Info.size = 2*VT.getStoreSize()-1; Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::ppc_qpx_qvstfda: @@ -13117,9 +13535,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.offset = 0; Info.size = VT.getStoreSize(); Info.align = 1; - Info.vol = false; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore; return true; } default: @@ -13146,12 +13562,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, bool MemcpyStrSrc, MachineFunction &MF) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - const Function *F = MF.getFunction(); + const Function &F = MF.getFunction(); // When expanding a memset, require at least two QPX instructions to cover // the cost of loading the value to be stored from the constant pool. 
if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && - !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + !F.hasFnAttribute(Attribute::NoImplicitFloat)) { return MVT::v4f64; } @@ -13216,8 +13632,9 @@ bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return TargetLowering::isZExtFree(Val, VT2); } -bool PPCTargetLowering::isFPExtFree(EVT VT) const { - assert(VT.isFloatingPoint()); +bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { + assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && + "invalid fpext types"); return true; } @@ -13369,7 +13786,7 @@ void PPCTargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction()->hasFnAttribute( + assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); @@ -13467,3 +13884,38 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { return SDValue(); } + +bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { + // Only duplicate to increase tail-calls for the 64bit SysV ABIs. + if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) + return false; + + // If not a tail call then no need to proceed. + if (!CI->isTailCall()) + return false; + + // If tail calls are disabled for the caller then we are done. + const Function *Caller = CI->getParent()->getParent(); + auto Attr = Caller->getFnAttribute("disable-tail-calls"); + if (Attr.getValueAsString() == "true") + return false; + + // If sibling calls have been disabled and tail-calls aren't guaranteed + // there is no reason to duplicate. + auto &TM = getTargetMachine(); + if (!TM.Options.GuaranteedTailCallOpt && DisableSCO) + return false; + + // Can't tail call a function called indirectly, or if it has variadic args. + const Function *Callee = CI->getCalledFunction(); + if (!Callee || Callee->isVarArg()) + return false; + + // Make sure the callee and caller calling conventions are eligible for tco. + if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), + CI->getCallingConv())) + return false; + + // If the function is local then we have a good chance at tail-calling it + return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); +} |
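The sub-word ATOMIC_CMP_SWAP change above (LowerATOMIC_CMP_SWAP plus the new PPCISD::ATOMIC_CMP_SWAP_8/_16 nodes) exists because, as the patch comment notes, atomic loads zero-extend the loaded byte or halfword, so an i8/i16 compare operand that arrives sign-extended in a register would never match. A minimal C++ illustration of the situation the lowering has to get right — a sketch, not code from the imported release:

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  // The stored byte is 0xFF; the hardware compare-and-swap loop reloads
  // it zero-extended.
  std::atomic<int8_t> flag{-1};
  // In a 32/64-bit register the expected value -1 is sign-extended, so the
  // comparison can only succeed if the backend masks the compare operand
  // down to the memory width first -- which is what the new
  // LowerATOMIC_CMP_SWAP does for i8/i16 before building the custom node.
  int8_t expected = -1;
  bool ok = flag.compare_exchange_strong(expected, 0);
  assert(ok && flag.load() == 0);
  return 0;
}
```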
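The BSWAP hunks make ISD::BSWAP on i64 Custom for ISA 3.0 and, per the comments in the new LowerBSWAP, route a scalar 64-bit byte swap through the vector unit (MTVSRDD, XXBRD, MFVSRD). For reference, a sketch of the operation being lowered, in portable C++ (illustration only, not part of the patch):

```cpp
#include <cstdint>

// Reference semantics for ISD::BSWAP on a 64-bit value; equivalent to
// __builtin_bswap64(x). On an ISA 3.0 (Power9) target the patched backend
// is meant to select the mtvsrdd/xxbrd/mfvsrd sequence for this instead
// of the generic expansion.
uint64_t bswap64_ref(uint64_t x) {
  uint64_t r = 0;
  for (int i = 0; i < 8; ++i)
    r = (r << 8) | ((x >> (8 * i)) & 0xff);
  return r;
}
```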
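The LowerINTRINSIC_WO_CHAIN hunk pattern-matches max(a, 0 - a) (or the swapped form) on the signed vector max intrinsics and folds it to ISD::ABS when P9Vector is available. A hedged sketch of the source-level idiom this targets, using AltiVec intrinsics (assumes a PowerPC target built with -maltivec; vec_max on vector signed int maps to the vmaxsw intrinsic the code checks for):

```cpp
#include <altivec.h>

// |a| expressed as max(0 - a, a); under the patch this shape can be
// selected as a single vector absolute-value (ISD::ABS) on Power9 instead
// of a subtract followed by a max. Usual caveat: INT_MIN stays INT_MIN
// because 0 - INT_MIN wraps.
vector signed int abs_v4i32(vector signed int a) {
  vector signed int zero = vec_splats(0);
  return vec_max(vec_sub(zero, a), a);
}
```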
