Diffstat (limited to 'gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp | 3078 |
1 file changed, 2535 insertions, 543 deletions
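Before the hunks below, a small orientation aid: among the larger additions (the `_bittest*` family, MSVC `setjmp` helpers, `__builtin_dump_struct`, NEON updates), the patch also replaces the old select-based `_rotl`/`_rotr` lowering with a branch-free masked-shift pattern. The following C++ sketch mirrors the pattern the new `_rotr` code emits for a 32-bit value; the helper name and the fixed width are illustrative, not part of the commit.

```cpp
#include <cstdint>

// Illustrative equivalent of the patched _rotr lowering (masked shift amounts,
// no select on Shift == 0). Name and 32-bit specialization are hypothetical.
static inline uint32_t rotr32(uint32_t Val, uint32_t Shift) {
  const uint32_t Mask = 31;                 // ArgWidth - 1
  uint32_t RightShiftAmt = Shift & Mask;    // Shift reduced mod 32
  uint32_t LeftShiftAmt = (-Shift) & Mask;  // (32 - Shift) mod 32; 0 stays 0
  // Both shift amounts are always < 32, so the zero-shift case needs no
  // special handling: Val >> 0 | Val << 0 == Val.
  return (Val >> RightShiftAmt) | (Val << LeftShiftAmt);
}
```

The masking makes both shift counts well defined for any `Shift`, which is why the new code can drop the `ShiftIsZero` select that the removed lines carried.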
diff --git a/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp b/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp index 35ae114c4f2..0770c204dab 100644 --- a/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp +++ b/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp @@ -14,6 +14,7 @@ #include "CGCXXABI.h" #include "CGObjCRuntime.h" #include "CGOpenCLRuntime.h" +#include "CGRecordLayout.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" @@ -188,7 +189,7 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, return RValue::get(Result); } -/// @brief Utility to insert an atomic cmpxchg instruction. +/// Utility to insert an atomic cmpxchg instruction. /// /// @param CGF The current codegen function. /// @param E Builtin call expression to convert to cmpxchg. @@ -319,7 +320,7 @@ static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD, return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot()); } -/// \brief Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.* +/// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.* /// depending on IntrinsicID. /// /// \arg CGF The current codegen function. @@ -384,7 +385,7 @@ EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) { } // The encompassing type must have a width greater than or equal to the width - // of the specified types. Aditionally, if the encompassing type is signed, + // of the specified types. Additionally, if the encompassing type is signed, // its width must be strictly greater than the width of any unsigned types // given. unsigned Width = 0; @@ -478,13 +479,261 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type, // LLVM only supports 0 and 2, make sure that we pass along that as a boolean. Value *Min = Builder.getInt1((Type & 2) != 0); - // For GCC compatability, __builtin_object_size treat NULL as unknown size. + // For GCC compatibility, __builtin_object_size treat NULL as unknown size. Value *NullIsUnknown = Builder.getTrue(); return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown}); } -// Many of MSVC builtins are on both x64 and ARM; to avoid repeating code, we -// handle them here. +namespace { +/// A struct to generically desribe a bit test intrinsic. +struct BitTest { + enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set }; + enum InterlockingKind : uint8_t { + Unlocked, + Sequential, + Acquire, + Release, + NoFence + }; + + ActionKind Action; + InterlockingKind Interlocking; + bool Is64Bit; + + static BitTest decodeBitTestBuiltin(unsigned BuiltinID); +}; +} // namespace + +BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) { + switch (BuiltinID) { + // Main portable variants. + case Builtin::BI_bittest: + return {TestOnly, Unlocked, false}; + case Builtin::BI_bittestandcomplement: + return {Complement, Unlocked, false}; + case Builtin::BI_bittestandreset: + return {Reset, Unlocked, false}; + case Builtin::BI_bittestandset: + return {Set, Unlocked, false}; + case Builtin::BI_interlockedbittestandreset: + return {Reset, Sequential, false}; + case Builtin::BI_interlockedbittestandset: + return {Set, Sequential, false}; + + // X86-specific 64-bit variants. 
+ case Builtin::BI_bittest64: + return {TestOnly, Unlocked, true}; + case Builtin::BI_bittestandcomplement64: + return {Complement, Unlocked, true}; + case Builtin::BI_bittestandreset64: + return {Reset, Unlocked, true}; + case Builtin::BI_bittestandset64: + return {Set, Unlocked, true}; + case Builtin::BI_interlockedbittestandreset64: + return {Reset, Sequential, true}; + case Builtin::BI_interlockedbittestandset64: + return {Set, Sequential, true}; + + // ARM/AArch64-specific ordering variants. + case Builtin::BI_interlockedbittestandset_acq: + return {Set, Acquire, false}; + case Builtin::BI_interlockedbittestandset_rel: + return {Set, Release, false}; + case Builtin::BI_interlockedbittestandset_nf: + return {Set, NoFence, false}; + case Builtin::BI_interlockedbittestandreset_acq: + return {Reset, Acquire, false}; + case Builtin::BI_interlockedbittestandreset_rel: + return {Reset, Release, false}; + case Builtin::BI_interlockedbittestandreset_nf: + return {Reset, NoFence, false}; + } + llvm_unreachable("expected only bittest intrinsics"); +} + +static char bitActionToX86BTCode(BitTest::ActionKind A) { + switch (A) { + case BitTest::TestOnly: return '\0'; + case BitTest::Complement: return 'c'; + case BitTest::Reset: return 'r'; + case BitTest::Set: return 's'; + } + llvm_unreachable("invalid action"); +} + +static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF, + BitTest BT, + const CallExpr *E, Value *BitBase, + Value *BitPos) { + char Action = bitActionToX86BTCode(BT.Action); + char SizeSuffix = BT.Is64Bit ? 'q' : 'l'; + + // Build the assembly. + SmallString<64> Asm; + raw_svector_ostream AsmOS(Asm); + if (BT.Interlocking != BitTest::Unlocked) + AsmOS << "lock "; + AsmOS << "bt"; + if (Action) + AsmOS << Action; + AsmOS << SizeSuffix << " $2, ($1)\n\tsetc ${0:b}"; + + // Build the constraints. FIXME: We should support immediates when possible. + std::string Constraints = "=r,r,r,~{cc},~{flags},~{fpsr}"; + llvm::IntegerType *IntType = llvm::IntegerType::get( + CGF.getLLVMContext(), + CGF.getContext().getTypeSize(E->getArg(1)->getType())); + llvm::Type *IntPtrType = IntType->getPointerTo(); + llvm::FunctionType *FTy = + llvm::FunctionType::get(CGF.Int8Ty, {IntPtrType, IntType}, false); + + llvm::InlineAsm *IA = + llvm::InlineAsm::get(FTy, Asm, Constraints, /*SideEffects=*/true); + return CGF.Builder.CreateCall(IA, {BitBase, BitPos}); +} + +static llvm::AtomicOrdering +getBitTestAtomicOrdering(BitTest::InterlockingKind I) { + switch (I) { + case BitTest::Unlocked: return llvm::AtomicOrdering::NotAtomic; + case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent; + case BitTest::Acquire: return llvm::AtomicOrdering::Acquire; + case BitTest::Release: return llvm::AtomicOrdering::Release; + case BitTest::NoFence: return llvm::AtomicOrdering::Monotonic; + } + llvm_unreachable("invalid interlocking"); +} + +/// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of +/// bits and a bit position and read and optionally modify the bit at that +/// position. The position index can be arbitrarily large, i.e. it can be larger +/// than 31 or 63, so we need an indexed load in the general case. 
+static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF, + unsigned BuiltinID, + const CallExpr *E) { + Value *BitBase = CGF.EmitScalarExpr(E->getArg(0)); + Value *BitPos = CGF.EmitScalarExpr(E->getArg(1)); + + BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID); + + // X86 has special BT, BTC, BTR, and BTS instructions that handle the array + // indexing operation internally. Use them if possible. + llvm::Triple::ArchType Arch = CGF.getTarget().getTriple().getArch(); + if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) + return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos); + + // Otherwise, use generic code to load one byte and test the bit. Use all but + // the bottom three bits as the array index, and the bottom three bits to form + // a mask. + // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0; + Value *ByteIndex = CGF.Builder.CreateAShr( + BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx"); + Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy); + Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8, + ByteIndex, "bittest.byteaddr"), + CharUnits::One()); + Value *PosLow = + CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty), + llvm::ConstantInt::get(CGF.Int8Ty, 0x7)); + + // The updating instructions will need a mask. + Value *Mask = nullptr; + if (BT.Action != BitTest::TestOnly) { + Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow, + "bittest.mask"); + } + + // Check the action and ordering of the interlocked intrinsics. + llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking); + + Value *OldByte = nullptr; + if (Ordering != llvm::AtomicOrdering::NotAtomic) { + // Emit a combined atomicrmw load/store operation for the interlocked + // intrinsics. + llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or; + if (BT.Action == BitTest::Reset) { + Mask = CGF.Builder.CreateNot(Mask); + RMWOp = llvm::AtomicRMWInst::And; + } + OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr.getPointer(), Mask, + Ordering); + } else { + // Emit a plain load for the non-interlocked intrinsics. + OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte"); + Value *NewByte = nullptr; + switch (BT.Action) { + case BitTest::TestOnly: + // Don't store anything. + break; + case BitTest::Complement: + NewByte = CGF.Builder.CreateXor(OldByte, Mask); + break; + case BitTest::Reset: + NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask)); + break; + case BitTest::Set: + NewByte = CGF.Builder.CreateOr(OldByte, Mask); + break; + } + if (NewByte) + CGF.Builder.CreateStore(NewByte, ByteAddr); + } + + // However we loaded the old byte, either by plain load or atomicrmw, shift + // the bit into the low position and mask it to 0 or 1. + Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr"); + return CGF.Builder.CreateAnd( + ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res"); +} + +namespace { +enum class MSVCSetJmpKind { + _setjmpex, + _setjmp3, + _setjmp +}; +} + +/// MSVC handles setjmp a bit differently on different platforms. On every +/// architecture except 32-bit x86, the frame address is passed. On x86, extra +/// parameters can be passed as variadic arguments, but we always pass none. 
+static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind, + const CallExpr *E) { + llvm::Value *Arg1 = nullptr; + llvm::Type *Arg1Ty = nullptr; + StringRef Name; + bool IsVarArg = false; + if (SJKind == MSVCSetJmpKind::_setjmp3) { + Name = "_setjmp3"; + Arg1Ty = CGF.Int32Ty; + Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0); + IsVarArg = true; + } else { + Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex"; + Arg1Ty = CGF.Int8PtrTy; + Arg1 = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(Intrinsic::frameaddress), + llvm::ConstantInt::get(CGF.Int32Ty, 0)); + } + + // Mark the call site and declaration with ReturnsTwice. + llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty}; + llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get( + CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex, + llvm::Attribute::ReturnsTwice); + llvm::Constant *SetJmpFn = CGF.CGM.CreateRuntimeFunction( + llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name, + ReturnsTwiceAttr, /*Local=*/true); + + llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast( + CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy); + llvm::Value *Args[] = {Buf, Arg1}; + llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args); + CS.setAttributes(ReturnsTwiceAttr); + return RValue::get(CS.getInstruction()); +} + +// Many of MSVC builtins are on x64, ARM and AArch64; to avoid repeating code, +// we handle them here. enum class CodeGenFunction::MSVCIntrin { _BitScanForward, _BitScanReverse, @@ -496,7 +745,6 @@ enum class CodeGenFunction::MSVCIntrin { _InterlockedIncrement, _InterlockedOr, _InterlockedXor, - _interlockedbittestandset, __fastfail, }; @@ -564,22 +812,6 @@ Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, case MSVCIntrin::_InterlockedXor: return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E); - case MSVCIntrin::_interlockedbittestandset: { - llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); - llvm::Value *Bit = EmitScalarExpr(E->getArg(1)); - AtomicRMWInst *RMWI = Builder.CreateAtomicRMW( - AtomicRMWInst::Or, Addr, - Builder.CreateShl(ConstantInt::get(Bit->getType(), 1), Bit), - llvm::AtomicOrdering::SequentiallyConsistent); - // Shift the relevant bit to the least significant position, truncate to - // the result type, and test the low bit. 
- llvm::Value *Shifted = Builder.CreateLShr(RMWI, Bit); - llvm::Value *Truncated = - Builder.CreateTrunc(Shifted, ConvertType(E->getType())); - return Builder.CreateAnd(Truncated, - ConstantInt::get(Truncated->getType(), 1)); - } - case MSVCIntrin::_InterlockedDecrement: { llvm::Type *IntTy = ConvertType(E->getType()); AtomicRMWInst *RMWI = Builder.CreateAtomicRMW( @@ -930,6 +1162,96 @@ EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1, return RValue::get(Overflow); } +static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType, + Value *&RecordPtr, CharUnits Align, Value *Func, + int Lvl) { + const auto *RT = RType->getAs<RecordType>(); + ASTContext &Context = CGF.getContext(); + RecordDecl *RD = RT->getDecl()->getDefinition(); + ASTContext &Ctx = RD->getASTContext(); + const ASTRecordLayout &RL = Ctx.getASTRecordLayout(RD); + std::string Pad = std::string(Lvl * 4, ' '); + + Value *GString = + CGF.Builder.CreateGlobalStringPtr(RType.getAsString() + " {\n"); + Value *Res = CGF.Builder.CreateCall(Func, {GString}); + + static llvm::DenseMap<QualType, const char *> Types; + if (Types.empty()) { + Types[Context.CharTy] = "%c"; + Types[Context.BoolTy] = "%d"; + Types[Context.SignedCharTy] = "%hhd"; + Types[Context.UnsignedCharTy] = "%hhu"; + Types[Context.IntTy] = "%d"; + Types[Context.UnsignedIntTy] = "%u"; + Types[Context.LongTy] = "%ld"; + Types[Context.UnsignedLongTy] = "%lu"; + Types[Context.LongLongTy] = "%lld"; + Types[Context.UnsignedLongLongTy] = "%llu"; + Types[Context.ShortTy] = "%hd"; + Types[Context.UnsignedShortTy] = "%hu"; + Types[Context.VoidPtrTy] = "%p"; + Types[Context.FloatTy] = "%f"; + Types[Context.DoubleTy] = "%f"; + Types[Context.LongDoubleTy] = "%Lf"; + Types[Context.getPointerType(Context.CharTy)] = "%s"; + Types[Context.getPointerType(Context.getConstType(Context.CharTy))] = "%s"; + } + + for (const auto *FD : RD->fields()) { + uint64_t Off = RL.getFieldOffset(FD->getFieldIndex()); + Off = Ctx.toCharUnitsFromBits(Off).getQuantity(); + + Value *FieldPtr = RecordPtr; + if (RD->isUnion()) + FieldPtr = CGF.Builder.CreatePointerCast( + FieldPtr, CGF.ConvertType(Context.getPointerType(FD->getType()))); + else + FieldPtr = CGF.Builder.CreateStructGEP(CGF.ConvertType(RType), FieldPtr, + FD->getFieldIndex()); + + GString = CGF.Builder.CreateGlobalStringPtr( + llvm::Twine(Pad) + .concat(FD->getType().getAsString()) + .concat(llvm::Twine(' ')) + .concat(FD->getNameAsString()) + .concat(" : ") + .str()); + Value *TmpRes = CGF.Builder.CreateCall(Func, {GString}); + Res = CGF.Builder.CreateAdd(Res, TmpRes); + + QualType CanonicalType = + FD->getType().getUnqualifiedType().getCanonicalType(); + + // We check whether we are in a recursive type + if (CanonicalType->isRecordType()) { + Value *TmpRes = + dumpRecord(CGF, CanonicalType, FieldPtr, Align, Func, Lvl + 1); + Res = CGF.Builder.CreateAdd(TmpRes, Res); + continue; + } + + // We try to determine the best format to print the current field + llvm::Twine Format = Types.find(CanonicalType) == Types.end() + ? 
Types[Context.VoidPtrTy] + : Types[CanonicalType]; + + Address FieldAddress = Address(FieldPtr, Align); + FieldPtr = CGF.Builder.CreateLoad(FieldAddress); + + // FIXME Need to handle bitfield here + GString = CGF.Builder.CreateGlobalStringPtr( + Format.concat(llvm::Twine('\n')).str()); + TmpRes = CGF.Builder.CreateCall(Func, {GString, FieldPtr}); + Res = CGF.Builder.CreateAdd(Res, TmpRes); + } + + GString = CGF.Builder.CreateGlobalStringPtr(Pad + "}\n"); + Value *TmpRes = CGF.Builder.CreateCall(Func, {GString}); + Res = CGF.Builder.CreateAdd(Res, TmpRes); + return Res; +} + RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue) { @@ -966,6 +1288,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_copysign: case Builtin::BI__builtin_copysignf: case Builtin::BI__builtin_copysignl: + case Builtin::BI__builtin_copysignf128: return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign)); case Builtin::BIcos: @@ -998,6 +1321,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_fabs: case Builtin::BI__builtin_fabsf: case Builtin::BI__builtin_fabsl: + case Builtin::BI__builtin_fabsf128: return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs)); case Builtin::BIfloor: @@ -1158,16 +1482,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_abs: case Builtin::BI__builtin_labs: case Builtin::BI__builtin_llabs: { + // X < 0 ? -X : X + // The negation has 'nsw' because abs of INT_MIN is undefined. Value *ArgValue = EmitScalarExpr(E->getArg(0)); - - Value *NegOp = Builder.CreateNeg(ArgValue, "neg"); - Value *CmpResult = - Builder.CreateICmpSGE(ArgValue, - llvm::Constant::getNullValue(ArgValue->getType()), - "abscond"); - Value *Result = - Builder.CreateSelect(CmpResult, ArgValue, NegOp, "abs"); - + Value *NegOp = Builder.CreateNSWNeg(ArgValue, "neg"); + Constant *Zero = llvm::Constant::getNullValue(ArgValue->getType()); + Value *CmpResult = Builder.CreateICmpSLT(ArgValue, Zero, "abscond"); + Value *Result = Builder.CreateSelect(CmpResult, NegOp, ArgValue, "abs"); return RValue::get(Result); } case Builtin::BI__builtin_conj: @@ -1194,6 +1515,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return RValue::get(ComplexVal.first); } + case Builtin::BI__builtin_dump_struct: { + Value *Func = EmitScalarExpr(E->getArg(1)->IgnoreImpCasts()); + CharUnits Arg0Align = EmitPointerWithAlignment(E->getArg(0)).getAlignment(); + + const Expr *Arg0 = E->getArg(0)->IgnoreImpCasts(); + QualType Arg0Type = Arg0->getType()->getPointeeType(); + + Value *RecordPtr = EmitScalarExpr(Arg0); + Value *Res = dumpRecord(*this, Arg0Type, RecordPtr, Arg0Align, Func, 0); + return RValue::get(Res); + } + case Builtin::BI__builtin_cimag: case Builtin::BI__builtin_cimagf: case Builtin::BI__builtin_cimagl: @@ -1304,20 +1637,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::Type *ArgType = Val->getType(); Shift = Builder.CreateIntCast(Shift, ArgType, false); - unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth(); - Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth); - Value *ArgZero = llvm::Constant::getNullValue(ArgType); - + unsigned ArgWidth = ArgType->getIntegerBitWidth(); Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1); - Shift = Builder.CreateAnd(Shift, Mask); - Value *LeftShift = Builder.CreateSub(ArgTypeSize, Shift); - - Value 
*RightShifted = Builder.CreateLShr(Val, Shift); - Value *LeftShifted = Builder.CreateShl(Val, LeftShift); - Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted); - Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero); - Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated); + Value *RightShiftAmt = Builder.CreateAnd(Shift, Mask); + Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt); + Value *LeftShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask); + Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt); + Value *Result = Builder.CreateOr(LeftShifted, RightShifted); return RValue::get(Result); } case Builtin::BI_rotl8: @@ -1330,20 +1657,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::Type *ArgType = Val->getType(); Shift = Builder.CreateIntCast(Shift, ArgType, false); - unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth(); - Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth); - Value *ArgZero = llvm::Constant::getNullValue(ArgType); - + unsigned ArgWidth = ArgType->getIntegerBitWidth(); Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1); - Shift = Builder.CreateAnd(Shift, Mask); - Value *RightShift = Builder.CreateSub(ArgTypeSize, Shift); - - Value *LeftShifted = Builder.CreateShl(Val, Shift); - Value *RightShifted = Builder.CreateLShr(Val, RightShift); - Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted); - Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero); - Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated); + Value *LeftShiftAmt = Builder.CreateAnd(Shift, Mask); + Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt); + Value *RightShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask); + Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt); + Value *Result = Builder.CreateOr(LeftShifted, RightShifted); return RValue::get(Result); } case Builtin::BI__builtin_unpredictable: { @@ -1739,6 +2060,63 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, Builder.CreateMemSet(Dest, ByteVal, SizeVal, false); return RValue::get(Dest.getPointer()); } + case Builtin::BI__builtin_wmemcmp: { + // The MSVC runtime library does not provide a definition of wmemcmp, so we + // need an inline implementation. 
+ if (!getTarget().getTriple().isOSMSVCRT()) + break; + + llvm::Type *WCharTy = ConvertType(getContext().WCharTy); + + Value *Dst = EmitScalarExpr(E->getArg(0)); + Value *Src = EmitScalarExpr(E->getArg(1)); + Value *Size = EmitScalarExpr(E->getArg(2)); + + BasicBlock *Entry = Builder.GetInsertBlock(); + BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt"); + BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt"); + BasicBlock *Next = createBasicBlock("wmemcmp.next"); + BasicBlock *Exit = createBasicBlock("wmemcmp.exit"); + Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0)); + Builder.CreateCondBr(SizeEq0, Exit, CmpGT); + + EmitBlock(CmpGT); + PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2); + DstPhi->addIncoming(Dst, Entry); + PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2); + SrcPhi->addIncoming(Src, Entry); + PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2); + SizePhi->addIncoming(Size, Entry); + CharUnits WCharAlign = + getContext().getTypeAlignInChars(getContext().WCharTy); + Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign); + Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign); + Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh); + Builder.CreateCondBr(DstGtSrc, Exit, CmpLT); + + EmitBlock(CmpLT); + Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh); + Builder.CreateCondBr(DstLtSrc, Exit, Next); + + EmitBlock(Next); + Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1); + Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1); + Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1)); + Value *NextSizeEq0 = + Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0)); + Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT); + DstPhi->addIncoming(NextDst, Next); + SrcPhi->addIncoming(NextSrc, Next); + SizePhi->addIncoming(NextSize, Next); + + EmitBlock(Exit); + PHINode *Ret = Builder.CreatePHI(IntTy, 4); + Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry); + Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT); + Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT); + Ret->addIncoming(ConstantInt::get(IntTy, 0), Next); + return RValue::get(Ret); + } case Builtin::BI__builtin_dwarf_cfa: { // The offset in bytes from the first argument to the CFA. // @@ -2037,7 +2415,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__sync_synchronize: { // We assume this is supposed to correspond to a C++0x-style // sequentially-consistent fence (i.e. this is only usable for - // synchonization, not device I/O or anything like that). This intrinsic + // synchronization, not device I/O or anything like that). This intrinsic // is really badly designed in the sense that in theory, there isn't // any way to safely use it... 
but in practice, it mostly works // to use it with non-atomic loads and stores to get acquire/release @@ -2552,11 +2930,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__builtin_addressof: return RValue::get(EmitLValue(E->getArg(0)).getPointer()); case Builtin::BI__builtin_operator_new: - return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(), - E->getArg(0), false); + return EmitBuiltinNewDeleteCall( + E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false); case Builtin::BI__builtin_operator_delete: - return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(), - E->getArg(0), true); + return EmitBuiltinNewDeleteCall( + E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true); + case Builtin::BI__noop: // __noop always evaluates to an integer literal zero. return RValue::get(ConstantInt::get(IntTy, 0)); @@ -2643,9 +3022,26 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI_InterlockedXor16: case Builtin::BI_InterlockedXor: return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E)); + + case Builtin::BI_bittest64: + case Builtin::BI_bittest: + case Builtin::BI_bittestandcomplement64: + case Builtin::BI_bittestandcomplement: + case Builtin::BI_bittestandreset64: + case Builtin::BI_bittestandreset: + case Builtin::BI_bittestandset64: + case Builtin::BI_bittestandset: + case Builtin::BI_interlockedbittestandreset: + case Builtin::BI_interlockedbittestandreset64: + case Builtin::BI_interlockedbittestandset64: case Builtin::BI_interlockedbittestandset: - return RValue::get( - EmitMSVCBuiltinExpr(MSVCIntrin::_interlockedbittestandset, E)); + case Builtin::BI_interlockedbittestandset_acq: + case Builtin::BI_interlockedbittestandset_rel: + case Builtin::BI_interlockedbittestandset_nf: + case Builtin::BI_interlockedbittestandreset_acq: + case Builtin::BI_interlockedbittestandreset_rel: + case Builtin::BI_interlockedbittestandreset_nf: + return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E)); case Builtin::BI__exception_code: case Builtin::BI_exception_code: @@ -2656,59 +3052,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__abnormal_termination: case Builtin::BI_abnormal_termination: return RValue::get(EmitSEHAbnormalTermination()); - case Builtin::BI_setjmpex: { - if (getTarget().getTriple().isOSMSVCRT()) { - llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy}; - llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get( - getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::ReturnsTwice); - llvm::Constant *SetJmpEx = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false), - "_setjmpex", ReturnsTwiceAttr, /*Local=*/true); - llvm::Value *Buf = Builder.CreateBitOrPointerCast( - EmitScalarExpr(E->getArg(0)), Int8PtrTy); - llvm::Value *FrameAddr = - Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress), - ConstantInt::get(Int32Ty, 0)); - llvm::Value *Args[] = {Buf, FrameAddr}; - llvm::CallSite CS = EmitRuntimeCallOrInvoke(SetJmpEx, Args); - CS.setAttributes(ReturnsTwiceAttr); - return RValue::get(CS.getInstruction()); - } + case Builtin::BI_setjmpex: + if (getTarget().getTriple().isOSMSVCRT()) + return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E); break; - } - case Builtin::BI_setjmp: { + case Builtin::BI_setjmp: if (getTarget().getTriple().isOSMSVCRT()) { - llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get( - getLLVMContext(), 
llvm::AttributeList::FunctionIndex, - llvm::Attribute::ReturnsTwice); - llvm::Value *Buf = Builder.CreateBitOrPointerCast( - EmitScalarExpr(E->getArg(0)), Int8PtrTy); - llvm::CallSite CS; - if (getTarget().getTriple().getArch() == llvm::Triple::x86) { - llvm::Type *ArgTypes[] = {Int8PtrTy, IntTy}; - llvm::Constant *SetJmp3 = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/true), - "_setjmp3", ReturnsTwiceAttr, /*Local=*/true); - llvm::Value *Count = ConstantInt::get(IntTy, 0); - llvm::Value *Args[] = {Buf, Count}; - CS = EmitRuntimeCallOrInvoke(SetJmp3, Args); - } else { - llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy}; - llvm::Constant *SetJmp = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false), - "_setjmp", ReturnsTwiceAttr, /*Local=*/true); - llvm::Value *FrameAddr = - Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress), - ConstantInt::get(Int32Ty, 0)); - llvm::Value *Args[] = {Buf, FrameAddr}; - CS = EmitRuntimeCallOrInvoke(SetJmp, Args); - } - CS.setAttributes(ReturnsTwiceAttr); - return RValue::get(CS.getInstruction()); + if (getTarget().getTriple().getArch() == llvm::Triple::x86) + return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E); + else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64) + return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E); + return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E); } break; - } case Builtin::BI__GetExceptionInfo: { if (llvm::GlobalVariable *GV = @@ -2736,6 +3092,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return EmitCoroutineIntrinsic(E, Intrinsic::coro_resume); case Builtin::BI__builtin_coro_frame: return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame); + case Builtin::BI__builtin_coro_noop: + return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop); case Builtin::BI__builtin_coro_free: return EmitCoroutineIntrinsic(E, Intrinsic::coro_free); case Builtin::BI__builtin_coro_destroy: @@ -2886,11 +3244,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions case Builtin::BIget_pipe_num_packets: case Builtin::BIget_pipe_max_packets: { - const char *Name; + const char *BaseName; + const PipeType *PipeTy = E->getArg(0)->getType()->getAs<PipeType>(); if (BuiltinID == Builtin::BIget_pipe_num_packets) - Name = "__get_pipe_num_packets"; + BaseName = "__get_pipe_num_packets"; else - Name = "__get_pipe_max_packets"; + BaseName = "__get_pipe_max_packets"; + auto Name = std::string(BaseName) + + std::string(PipeTy->isReadOnly() ? "_ro" : "_wo"); // Building the generic function prototype. Value *Arg0 = EmitScalarExpr(E->getArg(0)); @@ -2996,10 +3357,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return Ptr; }; - // Could have events and/or vaargs. + // Could have events and/or varargs. if (E->getArg(3)->getType()->isBlockPointerType()) { // No events passed, but has variadic arguments. - Name = "__enqueue_kernel_vaargs"; + Name = "__enqueue_kernel_varargs"; auto Info = CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)); llvm::Value *Kernel = @@ -3067,7 +3428,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, // Pass the number of variadics to the runtime function too. 
Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7)); ArgTys.push_back(Int32Ty); - Name = "__enqueue_kernel_events_vaargs"; + Name = "__enqueue_kernel_events_varargs"; auto *PtrToSizeArray = CreateArrayForSizeVar(7); Args.push_back(PtrToSizeArray); @@ -3108,7 +3469,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy}, false), - "__get_kernel_preferred_work_group_multiple_impl"), + "__get_kernel_preferred_work_group_size_multiple_impl"), {Kernel, Arg})); } case Builtin::BIget_kernel_max_sub_group_size_for_ndrange: @@ -3179,6 +3540,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__xray_customevent: { if (!ShouldXRayInstrumentFunction()) return RValue::getIgnored(); + + if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::Custom)) + return RValue::getIgnored(); + if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>()) if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents()) return RValue::getIgnored(); @@ -3202,6 +3568,44 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1})); } + case Builtin::BI__xray_typedevent: { + // TODO: There should be a way to always emit events even if the current + // function is not instrumented. Losing events in a stream can cripple + // a trace. + if (!ShouldXRayInstrumentFunction()) + return RValue::getIgnored(); + + if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has( + XRayInstrKind::Typed)) + return RValue::getIgnored(); + + if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>()) + if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents()) + return RValue::getIgnored(); + + Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent); + auto FTy = F->getFunctionType(); + auto Arg0 = EmitScalarExpr(E->getArg(0)); + auto PTy0 = FTy->getParamType(0); + if (PTy0 != Arg0->getType()) + Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0); + auto Arg1 = E->getArg(1); + auto Arg1Val = EmitScalarExpr(Arg1); + auto Arg1Ty = Arg1->getType(); + auto PTy1 = FTy->getParamType(1); + if (PTy1 != Arg1Val->getType()) { + if (Arg1Ty->isArrayType()) + Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer(); + else + Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1); + } + auto Arg2 = EmitScalarExpr(E->getArg(2)); + auto PTy2 = FTy->getParamType(2); + if (PTy2 != Arg2->getType()) + Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2); + return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2})); + } + case Builtin::BI__builtin_ms_va_start: case Builtin::BI__builtin_ms_va_end: return RValue::get( @@ -3250,6 +3654,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, // can move this up to the beginning of the function. checkTargetFeatures(E, FD); + if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID)) + LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth); + // See if we have a target specific intrinsic. 
const char *Name = getContext().BuiltinInfo.getName(BuiltinID); Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic; @@ -3257,7 +3664,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch()); if (!Prefix.empty()) { IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix.data(), Name); - // NOTE we dont need to perform a compatibility flag check here since the + // NOTE we don't need to perform a compatibility flag check here since the // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the // MS builtins via ALL_MS_LANGUAGES and are filtered earlier. if (IntrinsicID == Intrinsic::not_intrinsic) @@ -3382,7 +3789,7 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID, static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, NeonTypeFlags TypeFlags, - llvm::Triple::ArchType Arch, + bool HasLegalHalfType=true, bool V1Ty=false) { int IsQuad = TypeFlags.isQuad(); switch (TypeFlags.getEltType()) { @@ -3393,9 +3800,7 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, case NeonTypeFlags::Poly16: return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); case NeonTypeFlags::Float16: - // FIXME: Only AArch64 backend can so far properly handle half types. - // Remove else part once ARM backend support for half is complete. - if (Arch == llvm::Triple::aarch64) + if (HasLegalHalfType) return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad)); else return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); @@ -3458,7 +3863,7 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty, return ConstantInt::get(Ty, neg ? -SV : SV); } -// \brief Right-shift a vector by a constant. +// Right-shift a vector by a constant. 
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift, llvm::Type *Ty, bool usgn, const char *name) { @@ -3561,13 +3966,24 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vcaleq_v, arm_neon_vacge, 0), NEONMAP1(vcalt_v, arm_neon_vacgt, 0), NEONMAP1(vcaltq_v, arm_neon_vacgt, 0), + NEONMAP0(vceqz_v), + NEONMAP0(vceqzq_v), + NEONMAP0(vcgez_v), + NEONMAP0(vcgezq_v), + NEONMAP0(vcgtz_v), + NEONMAP0(vcgtzq_v), + NEONMAP0(vclez_v), + NEONMAP0(vclezq_v), NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType), NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType), + NEONMAP0(vcltz_v), + NEONMAP0(vcltzq_v), NEONMAP1(vclz_v, ctlz, Add1ArgType), NEONMAP1(vclzq_v, ctlz, Add1ArgType), NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0), + NEONMAP0(vcvt_f16_v), NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), @@ -3587,6 +4003,7 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0), + NEONMAP1(vcvta_u16_v, arm_neon_vcvtau, 0), NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0), @@ -3631,6 +4048,7 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0), + NEONMAP0(vcvtq_f16_v), NEONMAP0(vcvtq_f32_v), NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), @@ -3646,6 +4064,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP0(vcvtq_u16_v), NEONMAP0(vcvtq_u32_v), NEONMAP0(vcvtq_u64_v), + NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0), + NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -3656,18 +4076,30 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts), NEONMAP0(vld1_dup_v), NEONMAP1(vld1_v, arm_neon_vld1, 0), + NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0), + NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0), + NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0), NEONMAP0(vld1q_dup_v), NEONMAP1(vld1q_v, arm_neon_vld1, 0), + NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0), + NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0), + NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0), + NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0), NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0), NEONMAP1(vld2_v, arm_neon_vld2, 0), + NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0), NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0), NEONMAP1(vld2q_v, arm_neon_vld2, 0), + NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0), NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0), NEONMAP1(vld3_v, arm_neon_vld3, 0), + NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0), NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0), NEONMAP1(vld3q_v, arm_neon_vld3, 0), + NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0), NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0), NEONMAP1(vld4_v, arm_neon_vld4, 0), + NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0), NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0), NEONMAP1(vld4q_v, arm_neon_vld4, 0), NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts), @@ -3726,6 +4158,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { 
NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType), NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType), NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType), + NEONMAP0(vrndi_v), + NEONMAP0(vrndiq_v), NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType), NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType), NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType), @@ -3759,7 +4193,13 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP0(vshrn_n_v), NEONMAP0(vshrq_n_v), NEONMAP1(vst1_v, arm_neon_vst1, 0), + NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0), + NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0), + NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0), NEONMAP1(vst1q_v, arm_neon_vst1, 0), + NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0), + NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0), + NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0), NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0), NEONMAP1(vst2_v, arm_neon_vst2, 0), NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0), @@ -3799,8 +4239,18 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP1(vcaleq_v, aarch64_neon_facge, 0), NEONMAP1(vcalt_v, aarch64_neon_facgt, 0), NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0), + NEONMAP0(vceqz_v), + NEONMAP0(vceqzq_v), + NEONMAP0(vcgez_v), + NEONMAP0(vcgezq_v), + NEONMAP0(vcgtz_v), + NEONMAP0(vcgtzq_v), + NEONMAP0(vclez_v), + NEONMAP0(vclezq_v), NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType), NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType), + NEONMAP0(vcltz_v), + NEONMAP0(vcltzq_v), NEONMAP1(vclz_v, ctlz, Add1ArgType), NEONMAP1(vclzq_v, ctlz, Add1ArgType), NEONMAP1(vcnt_v, ctpop, Add1ArgType), @@ -3830,6 +4280,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), + NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0), + NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0), NEONMAP0(vext_v), NEONMAP0(vextq_v), NEONMAP0(vfma_v), @@ -3838,6 +4290,12 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts), NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts), + NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0), + NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0), + NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0), + NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0), + NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0), + NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType), @@ -3878,6 +4336,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType), NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts), NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts), + NEONMAP0(vrndi_v), + NEONMAP0(vrndiq_v), NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts), NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts), NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts), @@ -3901,6 +4361,12 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP0(vshr_n_v), NEONMAP0(vshrn_n_v), NEONMAP0(vshrq_n_v), + NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0), + NEONMAP1(vst1_x3_v, 
aarch64_neon_st1x3, 0), + NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0), + NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0), + NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0), + NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0), NEONMAP0(vsubhn_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), @@ -4099,6 +4565,37 @@ static const NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType), NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType), + // FP16 scalar intrinisics go here. + NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType), + NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType), + NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType), + NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType), + NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType), + NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType), }; #undef NEONMAP0 @@ -4248,8 +4745,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( NeonTypeFlags Type(NeonTypeConst.getZExtValue()); bool Usgn = Type.isUnsigned(); bool Quad = Type.isQuad(); + const bool HasLegalHalfType = getTarget().hasLegalHalfType(); - llvm::VectorType *VTy = GetNeonType(this, Type, Arch); + llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -4314,6 +4812,26 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); return EmitNeonCall(F, Ops, NameHint); } + case NEON::BI__builtin_neon_vceqz_v: + case NEON::BI__builtin_neon_vceqzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ, + ICmpInst::ICMP_EQ, 
"vceqz"); + case NEON::BI__builtin_neon_vcgez_v: + case NEON::BI__builtin_neon_vcgezq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE, + ICmpInst::ICMP_SGE, "vcgez"); + case NEON::BI__builtin_neon_vclez_v: + case NEON::BI__builtin_neon_vclezq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE, + ICmpInst::ICMP_SLE, "vclez"); + case NEON::BI__builtin_neon_vcgtz_v: + case NEON::BI__builtin_neon_vcgtzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT, + ICmpInst::ICMP_SGT, "vcgtz"); + case NEON::BI__builtin_neon_vcltz_v: + case NEON::BI__builtin_neon_vcltzq_v: + return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT, + ICmpInst::ICMP_SLT, "vcltz"); case NEON::BI__builtin_neon_vclz_v: case NEON::BI__builtin_neon_vclzq_v: // We generate target-independent intrinsic, which needs a second argument @@ -4323,13 +4841,15 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvt_f32_v: case NEON::BI__builtin_neon_vcvtq_f32_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad), Arch); + Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad), + HasLegalHalfType); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); case NEON::BI__builtin_neon_vcvt_f16_v: case NEON::BI__builtin_neon_vcvtq_f16_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), Arch); + Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), + HasLegalHalfType); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); case NEON::BI__builtin_neon_vcvt_n_f16_v: @@ -4378,6 +4898,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvta_s16_v: case NEON::BI__builtin_neon_vcvta_s32_v: case NEON::BI__builtin_neon_vcvta_s64_v: + case NEON::BI__builtin_neon_vcvta_u16_v: case NEON::BI__builtin_neon_vcvta_u32_v: case NEON::BI__builtin_neon_vcvta_u64_v: case NEON::BI__builtin_neon_vcvtaq_s16_v: @@ -4452,12 +4973,33 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Ops.push_back(getAlignmentValue32(PtrOp0)); return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1"); } + case NEON::BI__builtin_neon_vld1_x2_v: + case NEON::BI__builtin_neon_vld1q_x2_v: + case NEON::BI__builtin_neon_vld1_x3_v: + case NEON::BI__builtin_neon_vld1q_x3_v: + case NEON::BI__builtin_neon_vld1_x4_v: + case NEON::BI__builtin_neon_vld1q_x4_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); + Ops[1] = Builder.CreateBitCast(Ops[1], PTy); + llvm::Type *Tys[2] = { VTy, PTy }; + Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); + Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN"); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); + } case NEON::BI__builtin_neon_vld2_v: case NEON::BI__builtin_neon_vld2q_v: case NEON::BI__builtin_neon_vld3_v: case NEON::BI__builtin_neon_vld3q_v: case NEON::BI__builtin_neon_vld4_v: - case NEON::BI__builtin_neon_vld4q_v: { + case NEON::BI__builtin_neon_vld4q_v: + case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2q_dup_v: + case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3q_dup_v: + case NEON::BI__builtin_neon_vld4_dup_v: + case 
NEON::BI__builtin_neon_vld4q_dup_v: { llvm::Type *Tys[] = {Ty, Int8PtrTy}; Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); Value *Align = getAlignmentValue32(PtrOp1); @@ -4556,7 +5098,10 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vrsqrteq_v: Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint); - + case NEON::BI__builtin_neon_vrndi_v: + case NEON::BI__builtin_neon_vrndiq_v: + Int = Intrinsic::nearbyint; + return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint); case NEON::BI__builtin_neon_vrshr_n_v: case NEON::BI__builtin_neon_vrshrq_n_v: return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n", @@ -4607,6 +5152,23 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Ops.push_back(getAlignmentValue32(PtrOp0)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, ""); } + case NEON::BI__builtin_neon_vst1_x2_v: + case NEON::BI__builtin_neon_vst1q_x2_v: + case NEON::BI__builtin_neon_vst1_x3_v: + case NEON::BI__builtin_neon_vst1q_x3_v: + case NEON::BI__builtin_neon_vst1_x4_v: + case NEON::BI__builtin_neon_vst1q_x4_v: { + llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); + // TODO: Currently in AArch32 mode the pointer operand comes first, whereas + // in AArch64 it comes last. We may want to stick to one or another. + if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) { + llvm::Type *Tys[2] = { VTy, PTy }; + std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end()); + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); + } + llvm::Type *Tys[2] = { PTy, VTy }; + return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); + } case NEON::BI__builtin_neon_vsubhn_v: { llvm::VectorType *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); @@ -4689,6 +5251,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( } return SV; } + case NEON::BI__builtin_neon_vdot_v: + case NEON::BI__builtin_neon_vdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + Int = Usgn ? 
LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot"); + } } assert(Int && "Expected valid intrinsic number"); @@ -4897,6 +5467,34 @@ static bool HasExtraNeonArgument(unsigned BuiltinID) { return true; } +Value *CodeGenFunction::EmitISOVolatileLoad(const CallExpr *E) { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + QualType ElTy = E->getArg(0)->getType()->getPointeeType(); + CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy); + llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(), + LoadSize.getQuantity() * 8); + Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo()); + llvm::LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, LoadSize); + Load->setVolatile(true); + return Load; +} + +Value *CodeGenFunction::EmitISOVolatileStore(const CallExpr *E) { + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Value *Value = EmitScalarExpr(E->getArg(1)); + QualType ElTy = E->getArg(0)->getType()->getPointeeType(); + CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy); + llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(), + StoreSize.getQuantity() * 8); + Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo()); + llvm::StoreInst *Store = + Builder.CreateAlignedStore(Value, Ptr, + StoreSize); + Store->setVolatile(true); + return Store; +} + Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E, llvm::Triple::ArchType Arch) { @@ -5139,35 +5737,13 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, case ARM::BI__iso_volatile_load8: case ARM::BI__iso_volatile_load16: case ARM::BI__iso_volatile_load32: - case ARM::BI__iso_volatile_load64: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - QualType ElTy = E->getArg(0)->getType()->getPointeeType(); - CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy); - llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(), - LoadSize.getQuantity() * 8); - Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo()); - llvm::LoadInst *Load = - Builder.CreateAlignedLoad(Ptr, LoadSize); - Load->setVolatile(true); - return Load; - } + case ARM::BI__iso_volatile_load64: + return EmitISOVolatileLoad(E); case ARM::BI__iso_volatile_store8: case ARM::BI__iso_volatile_store16: case ARM::BI__iso_volatile_store32: - case ARM::BI__iso_volatile_store64: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Value *Value = EmitScalarExpr(E->getArg(1)); - QualType ElTy = E->getArg(0)->getType()->getPointeeType(); - CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy); - llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(), - StoreSize.getQuantity() * 8); - Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo()); - llvm::StoreInst *Store = - Builder.CreateAlignedStore(Value, Ptr, - StoreSize); - Store->setVolatile(true); - return Store; - } + case ARM::BI__iso_volatile_store64: + return EmitISOVolatileStore(E); } if (BuiltinID == ARM::BI__builtin_arm_clrex) { @@ -5312,8 +5888,11 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vld4_lane_v: case NEON::BI__builtin_neon_vld4q_lane_v: case NEON::BI__builtin_neon_vld2_dup_v: + case NEON::BI__builtin_neon_vld2q_dup_v: case NEON::BI__builtin_neon_vld3_dup_v: + case NEON::BI__builtin_neon_vld3q_dup_v: case NEON::BI__builtin_neon_vld4_dup_v: + case NEON::BI__builtin_neon_vld4q_dup_v: // Get the alignment for the argument in addition to the value; // we'll use it later. 
PtrOp1 = EmitPointerWithAlignment(E->getArg(1)); @@ -5349,6 +5928,12 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vgetq_lane_f32: return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane"); + case NEON::BI__builtin_neon_vrndns_f32: { + Value *Arg = EmitScalarExpr(E->getArg(0)); + llvm::Type *Tys[] = {Arg->getType()}; + Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys); + return Builder.CreateCall(F, {Arg}, "vrndn"); } + case NEON::BI__builtin_neon_vset_lane_i8: case NEON::BI__builtin_neon_vset_lane_i16: case NEON::BI__builtin_neon_vset_lane_i32: @@ -5438,7 +6023,8 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, bool usgn = Type.isUnsigned(); bool rightShift = false; - llvm::VectorType *VTy = GetNeonType(this, Type, Arch); + llvm::VectorType *VTy = GetNeonType(this, Type, + getTarget().hasLegalHalfType()); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -5483,68 +6069,6 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, Value *Ld = Builder.CreateLoad(PtrOp0); return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane"); } - case NEON::BI__builtin_neon_vld2_dup_v: - case NEON::BI__builtin_neon_vld3_dup_v: - case NEON::BI__builtin_neon_vld4_dup_v: { - // Handle 64-bit elements as a special-case. There is no "dup" needed. - if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) { - switch (BuiltinID) { - case NEON::BI__builtin_neon_vld2_dup_v: - Int = Intrinsic::arm_neon_vld2; - break; - case NEON::BI__builtin_neon_vld3_dup_v: - Int = Intrinsic::arm_neon_vld3; - break; - case NEON::BI__builtin_neon_vld4_dup_v: - Int = Intrinsic::arm_neon_vld4; - break; - default: llvm_unreachable("unknown vld_dup intrinsic?"); - } - llvm::Type *Tys[] = {Ty, Int8PtrTy}; - Function *F = CGM.getIntrinsic(Int, Tys); - llvm::Value *Align = getAlignmentValue32(PtrOp1); - Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, "vld_dup"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); - } - switch (BuiltinID) { - case NEON::BI__builtin_neon_vld2_dup_v: - Int = Intrinsic::arm_neon_vld2lane; - break; - case NEON::BI__builtin_neon_vld3_dup_v: - Int = Intrinsic::arm_neon_vld3lane; - break; - case NEON::BI__builtin_neon_vld4_dup_v: - Int = Intrinsic::arm_neon_vld4lane; - break; - default: llvm_unreachable("unknown vld_dup intrinsic?"); - } - llvm::Type *Tys[] = {Ty, Int8PtrTy}; - Function *F = CGM.getIntrinsic(Int, Tys); - llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType()); - - SmallVector<Value*, 6> Args; - Args.push_back(Ops[1]); - Args.append(STy->getNumElements(), UndefValue::get(Ty)); - - llvm::Constant *CI = ConstantInt::get(Int32Ty, 0); - Args.push_back(CI); - Args.push_back(getAlignmentValue32(PtrOp1)); - - Ops[1] = Builder.CreateCall(F, Args, "vld_dup"); - // splat lane 0 to all elts in each vector of the result. - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Value *Val = Builder.CreateExtractValue(Ops[1], i); - Value *Elt = Builder.CreateBitCast(Val, Ty); - Elt = EmitNeonSplat(Elt, CI); - Elt = Builder.CreateBitCast(Elt, Val->getType()); - Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i); - } - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); - } case NEON::BI__builtin_neon_vqrshrn_n_v: Int = usgn ? 
Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns; @@ -5684,7 +6208,7 @@ static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags Type(Result.getZExtValue()); - llvm::VectorType *Ty = GetNeonType(&CGF, Type, Arch); + llvm::VectorType *Ty = GetNeonType(&CGF, Type); if (!Ty) return nullptr; @@ -5803,18 +6327,23 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, HintID = 0; break; case AArch64::BI__builtin_arm_yield: + case AArch64::BI__yield: HintID = 1; break; case AArch64::BI__builtin_arm_wfe: + case AArch64::BI__wfe: HintID = 2; break; case AArch64::BI__builtin_arm_wfi: + case AArch64::BI__wfi: HintID = 3; break; case AArch64::BI__builtin_arm_sev: + case AArch64::BI__sev: HintID = 4; break; case AArch64::BI__builtin_arm_sevl: + case AArch64::BI__sevl: HintID = 5; break; } @@ -6081,6 +6610,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, // Handle non-overloaded intrinsics first. switch (BuiltinID) { default: break; + case NEON::BI__builtin_neon_vabsh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs"); case NEON::BI__builtin_neon_vldrq_p128: { llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128); llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0); @@ -6123,6 +6655,153 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateUIToFP(Ops[0], FTy); return Builder.CreateSIToFP(Ops[0], FTy); } + case NEON::BI__builtin_neon_vcvth_f16_u16: + case NEON::BI__builtin_neon_vcvth_f16_u32: + case NEON::BI__builtin_neon_vcvth_f16_u64: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_f16_s16: + case NEON::BI__builtin_neon_vcvth_f16_s32: + case NEON::BI__builtin_neon_vcvth_f16_s64: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + llvm::Type *FTy = HalfTy; + llvm::Type *InTy; + if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64) + InTy = Int64Ty; + else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32) + InTy = Int32Ty; + else + InTy = Int16Ty; + Ops[0] = Builder.CreateBitCast(Ops[0], InTy); + if (usgn) + return Builder.CreateUIToFP(Ops[0], FTy); + return Builder.CreateSIToFP(Ops[0], FTy); + } + case NEON::BI__builtin_neon_vcvth_u16_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s16_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int16Ty); + return Builder.CreateFPToSI(Ops[0], Int16Ty); + } + case NEON::BI__builtin_neon_vcvth_u32_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s32_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int32Ty); + return Builder.CreateFPToSI(Ops[0], Int32Ty); + } + case NEON::BI__builtin_neon_vcvth_u64_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s64_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int64Ty); + return Builder.CreateFPToSI(Ops[0], Int64Ty); + } + case NEON::BI__builtin_neon_vcvtah_u16_f16: + case NEON::BI__builtin_neon_vcvtmh_u16_f16: + case NEON::BI__builtin_neon_vcvtnh_u16_f16: + case NEON::BI__builtin_neon_vcvtph_u16_f16: + case 
NEON::BI__builtin_neon_vcvtah_s16_f16: + case NEON::BI__builtin_neon_vcvtmh_s16_f16: + case NEON::BI__builtin_neon_vcvtnh_s16_f16: + case NEON::BI__builtin_neon_vcvtph_s16_f16: { + unsigned Int; + llvm::Type* InTy = Int32Ty; + llvm::Type* FTy = HalfTy; + llvm::Type *Tys[2] = {InTy, FTy}; + Ops.push_back(EmitScalarExpr(E->getArg(0))); + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vcvtah_u16_f16: + Int = Intrinsic::aarch64_neon_fcvtau; break; + case NEON::BI__builtin_neon_vcvtmh_u16_f16: + Int = Intrinsic::aarch64_neon_fcvtmu; break; + case NEON::BI__builtin_neon_vcvtnh_u16_f16: + Int = Intrinsic::aarch64_neon_fcvtnu; break; + case NEON::BI__builtin_neon_vcvtph_u16_f16: + Int = Intrinsic::aarch64_neon_fcvtpu; break; + case NEON::BI__builtin_neon_vcvtah_s16_f16: + Int = Intrinsic::aarch64_neon_fcvtas; break; + case NEON::BI__builtin_neon_vcvtmh_s16_f16: + Int = Intrinsic::aarch64_neon_fcvtms; break; + case NEON::BI__builtin_neon_vcvtnh_s16_f16: + Int = Intrinsic::aarch64_neon_fcvtns; break; + case NEON::BI__builtin_neon_vcvtph_s16_f16: + Int = Intrinsic::aarch64_neon_fcvtps; break; + } + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt"); + return Builder.CreateTrunc(Ops[0], Int16Ty); + } + case NEON::BI__builtin_neon_vcaleh_f16: + case NEON::BI__builtin_neon_vcalth_f16: + case NEON::BI__builtin_neon_vcageh_f16: + case NEON::BI__builtin_neon_vcagth_f16: { + unsigned Int; + llvm::Type* InTy = Int32Ty; + llvm::Type* FTy = HalfTy; + llvm::Type *Tys[2] = {InTy, FTy}; + Ops.push_back(EmitScalarExpr(E->getArg(1))); + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vcageh_f16: + Int = Intrinsic::aarch64_neon_facge; break; + case NEON::BI__builtin_neon_vcagth_f16: + Int = Intrinsic::aarch64_neon_facgt; break; + case NEON::BI__builtin_neon_vcaleh_f16: + Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break; + case NEON::BI__builtin_neon_vcalth_f16: + Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break; + } + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg"); + return Builder.CreateTrunc(Ops[0], Int16Ty); + } + case NEON::BI__builtin_neon_vcvth_n_s16_f16: + case NEON::BI__builtin_neon_vcvth_n_u16_f16: { + unsigned Int; + llvm::Type* InTy = Int32Ty; + llvm::Type* FTy = HalfTy; + llvm::Type *Tys[2] = {InTy, FTy}; + Ops.push_back(EmitScalarExpr(E->getArg(1))); + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vcvth_n_s16_f16: + Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break; + case NEON::BI__builtin_neon_vcvth_n_u16_f16: + Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break; + } + Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n"); + return Builder.CreateTrunc(Ops[0], Int16Ty); + } + case NEON::BI__builtin_neon_vcvth_n_f16_s16: + case NEON::BI__builtin_neon_vcvth_n_f16_u16: { + unsigned Int; + llvm::Type* FTy = HalfTy; + llvm::Type* InTy = Int32Ty; + llvm::Type *Tys[2] = {FTy, InTy}; + Ops.push_back(EmitScalarExpr(E->getArg(1))); + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vcvth_n_f16_s16: + Int = Intrinsic::aarch64_neon_vcvtfxs2fp; + Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext"); + break; + case NEON::BI__builtin_neon_vcvth_n_f16_u16: + Int = Intrinsic::aarch64_neon_vcvtfxu2fp; + Ops[0] = Builder.CreateZExt(Ops[0], InTy); + break; + } + return 
EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n"); + } case NEON::BI__builtin_neon_vpaddd_s64: { llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2); Value *Vec = EmitScalarExpr(E->getArg(0)); @@ -6164,6 +6843,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vceqzd_s64: case NEON::BI__builtin_neon_vceqzd_f64: case NEON::BI__builtin_neon_vceqzs_f32: + case NEON::BI__builtin_neon_vceqzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6171,6 +6851,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcgezd_s64: case NEON::BI__builtin_neon_vcgezd_f64: case NEON::BI__builtin_neon_vcgezs_f32: + case NEON::BI__builtin_neon_vcgezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6178,6 +6859,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vclezd_s64: case NEON::BI__builtin_neon_vclezd_f64: case NEON::BI__builtin_neon_vclezs_f32: + case NEON::BI__builtin_neon_vclezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6185,6 +6867,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcgtzd_s64: case NEON::BI__builtin_neon_vcgtzd_f64: case NEON::BI__builtin_neon_vcgtzs_f32: + case NEON::BI__builtin_neon_vcgtzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6192,6 +6875,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcltzd_s64: case NEON::BI__builtin_neon_vcltzd_f64: case NEON::BI__builtin_neon_vcltzs_f32: + case NEON::BI__builtin_neon_vcltzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6244,6 +6928,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd"); } + case NEON::BI__builtin_neon_vceqh_f16: + case NEON::BI__builtin_neon_vcleh_f16: + case NEON::BI__builtin_neon_vclth_f16: + case NEON::BI__builtin_neon_vcgeh_f16: + case NEON::BI__builtin_neon_vcgth_f16: { + llvm::CmpInst::Predicate P; + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break; + case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break; + case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break; + case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break; + case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break; + } + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy); + Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); + return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd"); + } case NEON::BI__builtin_neon_vceqd_s64: case NEON::BI__builtin_neon_vceqd_u64: case NEON::BI__builtin_neon_vcgtd_s64: @@ -6381,6 +7085,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, 
llvm::VectorType::get(DoubleTy, 2)); return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); + case NEON::BI__builtin_neon_vaddh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh"); + case NEON::BI__builtin_neon_vsubh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFSub(Ops[0], Ops[1], "vsubh"); + case NEON::BI__builtin_neon_vmulh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFMul(Ops[0], Ops[1], "vmulh"); + case NEON::BI__builtin_neon_vdivh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh"); + case NEON::BI__builtin_neon_vfmah_f16: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy); + // NEON intrinsic puts accumulator first, unlike the LLVM fma. + return Builder.CreateCall(F, + {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]}); + } + case NEON::BI__builtin_neon_vfmsh_f16: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy); + Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy); + Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh"); + // NEON intrinsic puts accumulator first, unlike the LLVM fma. + return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]}); + } case NEON::BI__builtin_neon_vaddd_s64: case NEON::BI__builtin_neon_vaddd_u64: return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd"); @@ -6538,7 +7267,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } } - llvm::VectorType *VTy = GetNeonType(this, Type, Arch); + llvm::VectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -6603,7 +7332,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true), Arch); + NeonTypeFlags(NeonTypeFlags::Float64, false, true)); Ops[2] = Builder.CreateBitCast(Ops[2], VTy); Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy); @@ -6655,12 +7384,22 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax; if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax"); + case NEON::BI__builtin_neon_vmaxh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmax; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax"); + } case NEON::BI__builtin_neon_vmin_v: case NEON::BI__builtin_neon_vminq_v: // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin; if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin"); + case NEON::BI__builtin_neon_vminh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmin; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin"); + } case NEON::BI__builtin_neon_vabd_v: case NEON::BI__builtin_neon_vabdq_v: // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. 
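The new vfmah_f16/vfmsh_f16 cases above map the scalar half-precision fused multiply-add/subtract onto llvm.fma, moving the NEON accumulator operand last and negating the first multiplicand for the subtract form. A sketch of that operand shuffling, using float as a stand-in for __fp16 (function names are illustrative):

    #include <cmath>

    // vfmah(acc, x, y): accumulator first in the intrinsic, last in fma.
    float vfmah_sketch(float acc, float x, float y) {
      return std::fma(x, y, acc);
    }

    // vfmsh(acc, x, y): first multiplicand negated, then the same fma.
    float vfmsh_sketch(float acc, float x, float y) {
      return std::fma(-x, y, acc);
    }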
@@ -6699,20 +7438,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vminnmq_v: Int = Intrinsic::aarch64_neon_fminnm; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm"); + case NEON::BI__builtin_neon_vminnmh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fminnm; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm"); case NEON::BI__builtin_neon_vmaxnm_v: case NEON::BI__builtin_neon_vmaxnmq_v: Int = Intrinsic::aarch64_neon_fmaxnm; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm"); + case NEON::BI__builtin_neon_vmaxnmh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmaxnm; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm"); case NEON::BI__builtin_neon_vrecpss_f32: { Ops.push_back(EmitScalarExpr(E->getArg(1))); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy), Ops, "vrecps"); } - case NEON::BI__builtin_neon_vrecpsd_f64: { + case NEON::BI__builtin_neon_vrecpsd_f64: Ops.push_back(EmitScalarExpr(E->getArg(1))); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy), Ops, "vrecps"); - } + case NEON::BI__builtin_neon_vrecpsh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy), + Ops, "vrecps"); case NEON::BI__builtin_neon_vqshrun_n_v: Int = Intrinsic::aarch64_neon_sqshrun; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n"); @@ -6728,72 +7478,87 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vqrshrn_n_v: Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n"); + case NEON::BI__builtin_neon_vrndah_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::round; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda"); + } case NEON::BI__builtin_neon_vrnda_v: case NEON::BI__builtin_neon_vrndaq_v: { Int = Intrinsic::round; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda"); } - case NEON::BI__builtin_neon_vrndi_v: - case NEON::BI__builtin_neon_vrndiq_v: { + case NEON::BI__builtin_neon_vrndih_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); Int = Intrinsic::nearbyint; - return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi"); + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi"); + } + case NEON::BI__builtin_neon_vrndmh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::floor; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm"); } case NEON::BI__builtin_neon_vrndm_v: case NEON::BI__builtin_neon_vrndmq_v: { Int = Intrinsic::floor; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm"); } + case NEON::BI__builtin_neon_vrndnh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::aarch64_neon_frintn; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn"); + } case NEON::BI__builtin_neon_vrndn_v: case NEON::BI__builtin_neon_vrndnq_v: { Int = Intrinsic::aarch64_neon_frintn; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn"); } + case NEON::BI__builtin_neon_vrndns_f32: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::aarch64_neon_frintn; + return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn"); + } + case NEON::BI__builtin_neon_vrndph_f16: { + 
Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::ceil; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp"); + } case NEON::BI__builtin_neon_vrndp_v: case NEON::BI__builtin_neon_vrndpq_v: { Int = Intrinsic::ceil; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp"); } + case NEON::BI__builtin_neon_vrndxh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::rint; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx"); + } case NEON::BI__builtin_neon_vrndx_v: case NEON::BI__builtin_neon_vrndxq_v: { Int = Intrinsic::rint; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx"); } + case NEON::BI__builtin_neon_vrndh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::trunc; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz"); + } case NEON::BI__builtin_neon_vrnd_v: case NEON::BI__builtin_neon_vrndq_v: { Int = Intrinsic::trunc; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz"); } - case NEON::BI__builtin_neon_vceqz_v: - case NEON::BI__builtin_neon_vceqzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ, - ICmpInst::ICMP_EQ, "vceqz"); - case NEON::BI__builtin_neon_vcgez_v: - case NEON::BI__builtin_neon_vcgezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE, - ICmpInst::ICMP_SGE, "vcgez"); - case NEON::BI__builtin_neon_vclez_v: - case NEON::BI__builtin_neon_vclezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE, - ICmpInst::ICMP_SLE, "vclez"); - case NEON::BI__builtin_neon_vcgtz_v: - case NEON::BI__builtin_neon_vcgtzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT, - ICmpInst::ICMP_SGT, "vcgtz"); - case NEON::BI__builtin_neon_vcltz_v: - case NEON::BI__builtin_neon_vcltzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT, - ICmpInst::ICMP_SLT, "vcltz"); case NEON::BI__builtin_neon_vcvt_f64_v: case NEON::BI__builtin_neon_vcvtq_f64_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad), Arch); + Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad)); return usgn ? 
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); case NEON::BI__builtin_neon_vcvt_f64_f32: { assert(Type.getEltType() == NeonTypeFlags::Float64 && quad && "unexpected vcvt_f64_f32 builtin"); NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false); - Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag, Arch)); + Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag)); return Builder.CreateFPExt(Ops[0], Ty, "vcvt"); } @@ -6801,7 +7566,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, assert(Type.getEltType() == NeonTypeFlags::Float32 && "unexpected vcvt_f32_f64 builtin"); NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true); - Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag, Arch)); + Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag)); return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt"); } @@ -6809,20 +7574,21 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcvt_u32_v: case NEON::BI__builtin_neon_vcvt_s64_v: case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_v: - case NEON::BI__builtin_neon_vcvt_u16_v: + case NEON::BI__builtin_neon_vcvt_s16_v: + case NEON::BI__builtin_neon_vcvt_u16_v: case NEON::BI__builtin_neon_vcvtq_s32_v: case NEON::BI__builtin_neon_vcvtq_u32_v: case NEON::BI__builtin_neon_vcvtq_s64_v: case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_v: - case NEON::BI__builtin_neon_vcvtq_u16_v: { + case NEON::BI__builtin_neon_vcvtq_s16_v: + case NEON::BI__builtin_neon_vcvtq_u16_v: { Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); if (usgn) return Builder.CreateFPToUI(Ops[0], Ty); return Builder.CreateFPToSI(Ops[0], Ty); } case NEON::BI__builtin_neon_vcvta_s16_v: + case NEON::BI__builtin_neon_vcvta_u16_v: case NEON::BI__builtin_neon_vcvta_s32_v: case NEON::BI__builtin_neon_vcvtaq_s16_v: case NEON::BI__builtin_neon_vcvtaq_s32_v: @@ -6890,6 +7656,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fmulx; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx"); } + case NEON::BI__builtin_neon_vmulxh_lane_f16: + case NEON::BI__builtin_neon_vmulxh_laneq_f16: { + // vmulx_lane should be mapped to Neon scalar mulx after + // extracting the scalar element + Ops.push_back(EmitScalarExpr(E->getArg(2))); + Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); + Ops.pop_back(); + Int = Intrinsic::aarch64_neon_fmulx; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx"); + } case NEON::BI__builtin_neon_vmul_lane_v: case NEON::BI__builtin_neon_vmul_laneq_v: { // v1f64 vmul_lane should be mapped to Neon scalar mul lane @@ -6898,7 +7674,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Quad = true; Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad), Arch); + NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); Ops[1] = Builder.CreateBitCast(Ops[1], VTy); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); @@ -6906,6 +7682,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } case NEON::BI__builtin_neon_vnegd_s64: return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd"); + case NEON::BI__builtin_neon_vnegh_f16: + return 
Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh"); case NEON::BI__builtin_neon_vpmaxnm_v: case NEON::BI__builtin_neon_vpmaxnmq_v: { Int = Intrinsic::aarch64_neon_fmaxnmp; @@ -6916,6 +7694,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_fminnmp; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm"); } + case NEON::BI__builtin_neon_vsqrth_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::sqrt; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt"); + } case NEON::BI__builtin_neon_vsqrt_v: case NEON::BI__builtin_neon_vsqrtq_v: { Int = Intrinsic::sqrt; @@ -7293,64 +8076,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateBitCast(Ops[0], VTy); return Builder.CreateAdd(Ops[0], tmp); } - // FIXME: Sharing loads & stores with 32-bit is complicated by the absence - // of an Align parameter here. - case NEON::BI__builtin_neon_vld1_x2_v: - case NEON::BI__builtin_neon_vld1q_x2_v: - case NEON::BI__builtin_neon_vld1_x3_v: - case NEON::BI__builtin_neon_vld1q_x3_v: - case NEON::BI__builtin_neon_vld1_x4_v: - case NEON::BI__builtin_neon_vld1q_x4_v: { - llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); - Ops[1] = Builder.CreateBitCast(Ops[1], PTy); - llvm::Type *Tys[2] = { VTy, PTy }; - unsigned Int; - switch (BuiltinID) { - case NEON::BI__builtin_neon_vld1_x2_v: - case NEON::BI__builtin_neon_vld1q_x2_v: - Int = Intrinsic::aarch64_neon_ld1x2; - break; - case NEON::BI__builtin_neon_vld1_x3_v: - case NEON::BI__builtin_neon_vld1q_x3_v: - Int = Intrinsic::aarch64_neon_ld1x3; - break; - case NEON::BI__builtin_neon_vld1_x4_v: - case NEON::BI__builtin_neon_vld1q_x4_v: - Int = Intrinsic::aarch64_neon_ld1x4; - break; - } - Function *F = CGM.getIntrinsic(Int, Tys); - Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN"); - Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); - } - case NEON::BI__builtin_neon_vst1_x2_v: - case NEON::BI__builtin_neon_vst1q_x2_v: - case NEON::BI__builtin_neon_vst1_x3_v: - case NEON::BI__builtin_neon_vst1q_x3_v: - case NEON::BI__builtin_neon_vst1_x4_v: - case NEON::BI__builtin_neon_vst1q_x4_v: { - llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType()); - llvm::Type *Tys[2] = { VTy, PTy }; - unsigned Int; - switch (BuiltinID) { - case NEON::BI__builtin_neon_vst1_x2_v: - case NEON::BI__builtin_neon_vst1q_x2_v: - Int = Intrinsic::aarch64_neon_st1x2; - break; - case NEON::BI__builtin_neon_vst1_x3_v: - case NEON::BI__builtin_neon_vst1q_x3_v: - Int = Intrinsic::aarch64_neon_st1x3; - break; - case NEON::BI__builtin_neon_vst1_x4_v: - case NEON::BI__builtin_neon_vst1q_x4_v: - Int = Intrinsic::aarch64_neon_st1x4; - break; - } - std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end()); - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, ""); - } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: { Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy)); @@ -7657,6 +8382,38 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_suqadd; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + case AArch64::BI__iso_volatile_load8: + case AArch64::BI__iso_volatile_load16: + case AArch64::BI__iso_volatile_load32: + case AArch64::BI__iso_volatile_load64: + return EmitISOVolatileLoad(E); + case 
AArch64::BI__iso_volatile_store8: + case AArch64::BI__iso_volatile_store16: + case AArch64::BI__iso_volatile_store32: + case AArch64::BI__iso_volatile_store64: + return EmitISOVolatileStore(E); + case AArch64::BI_BitScanForward: + case AArch64::BI_BitScanForward64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E); + case AArch64::BI_BitScanReverse: + case AArch64::BI_BitScanReverse64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E); + case AArch64::BI_InterlockedAnd64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E); + case AArch64::BI_InterlockedExchange64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E); + case AArch64::BI_InterlockedExchangeAdd64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E); + case AArch64::BI_InterlockedExchangeSub64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E); + case AArch64::BI_InterlockedOr64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E); + case AArch64::BI_InterlockedXor64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E); + case AArch64::BI_InterlockedDecrement64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E); + case AArch64::BI_InterlockedIncrement64: + return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E); } } @@ -7708,42 +8465,66 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, } static Value *EmitX86MaskedStore(CodeGenFunction &CGF, - SmallVectorImpl<Value *> &Ops, + ArrayRef<Value *> Ops, unsigned Align) { // Cast the pointer to right type. - Ops[0] = CGF.Builder.CreateBitCast(Ops[0], + Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType())); - // If the mask is all ones just emit a regular store. - if (const auto *C = dyn_cast<Constant>(Ops[2])) - if (C->isAllOnesValue()) - return CGF.Builder.CreateAlignedStore(Ops[1], Ops[0], Align); - Value *MaskVec = getMaskVecValue(CGF, Ops[2], Ops[1]->getType()->getVectorNumElements()); - return CGF.Builder.CreateMaskedStore(Ops[1], Ops[0], Align, MaskVec); + return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Align, MaskVec); } static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, - SmallVectorImpl<Value *> &Ops, unsigned Align) { + ArrayRef<Value *> Ops, unsigned Align) { // Cast the pointer to right type. - Ops[0] = CGF.Builder.CreateBitCast(Ops[0], + Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType())); - // If the mask is all ones just emit a regular store. - if (const auto *C = dyn_cast<Constant>(Ops[2])) - if (C->isAllOnesValue()) - return CGF.Builder.CreateAlignedLoad(Ops[0], Align); - Value *MaskVec = getMaskVecValue(CGF, Ops[2], Ops[1]->getType()->getVectorNumElements()); - return CGF.Builder.CreateMaskedLoad(Ops[0], Align, MaskVec, Ops[1]); + return CGF.Builder.CreateMaskedLoad(Ptr, Align, MaskVec, Ops[1]); +} + +static Value *EmitX86ExpandLoad(CodeGenFunction &CGF, + ArrayRef<Value *> Ops) { + llvm::Type *ResultTy = Ops[1]->getType(); + llvm::Type *PtrTy = ResultTy->getVectorElementType(); + + // Cast the pointer to element type. 
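EmitX86MaskedStore/EmitX86MaskedLoad above now take ArrayRef, no longer rewrite Ops in place, and always emit the masked intrinsics instead of special-casing an all-ones mask. A scalar sketch of the masked memory semantics they rely on (element type, 32-bit mask, and names are illustrative; assumes num_elts <= 32):

    #include <cstddef>
    #include <cstdint>

    // llvm.masked.store: lane i is written only when mask bit i is set.
    void masked_store_sketch(std::int32_t *mem, const std::int32_t *vec,
                             std::uint32_t mask, std::size_t num_elts) {
      for (std::size_t i = 0; i != num_elts; ++i)
        if (mask & (1u << i))
          mem[i] = vec[i];           // unselected lanes leave memory untouched
    }

    // llvm.masked.load: unselected lanes take the pass-through value (Ops[1]).
    void masked_load_sketch(std::int32_t *vec, const std::int32_t *mem,
                            const std::int32_t *passthru, std::uint32_t mask,
                            std::size_t num_elts) {
      for (std::size_t i = 0; i != num_elts; ++i)
        vec[i] = (mask & (1u << i)) ? mem[i] : passthru[i];
    }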
+ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(PtrTy)); + + Value *MaskVec = getMaskVecValue(CGF, Ops[2], + ResultTy->getVectorNumElements()); + + llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, + ResultTy); + return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] }); +} + +static Value *EmitX86CompressStore(CodeGenFunction &CGF, + ArrayRef<Value *> Ops) { + llvm::Type *ResultTy = Ops[1]->getType(); + llvm::Type *PtrTy = ResultTy->getVectorElementType(); + + // Cast the pointer to element type. + Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], + llvm::PointerType::getUnqual(PtrTy)); + + Value *MaskVec = getMaskVecValue(CGF, Ops[2], + ResultTy->getVectorNumElements()); + + llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore, + ResultTy); + return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec }); } static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc, - unsigned NumElts, SmallVectorImpl<Value *> &Ops, + unsigned NumElts, ArrayRef<Value *> Ops, bool InvertLHS = false) { Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts); Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts); @@ -7755,26 +8536,6 @@ static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc, CGF.Builder.getIntNTy(std::max(NumElts, 8U))); } -static Value *EmitX86SubVectorBroadcast(CodeGenFunction &CGF, - SmallVectorImpl<Value *> &Ops, - llvm::Type *DstTy, - unsigned SrcSizeInBits, - unsigned Align) { - // Load the subvector. - Ops[0] = CGF.Builder.CreateAlignedLoad(Ops[0], Align); - - // Create broadcast mask. - unsigned NumDstElts = DstTy->getVectorNumElements(); - unsigned NumSrcElts = SrcSizeInBits / DstTy->getScalarSizeInBits(); - - SmallVector<uint32_t, 8> Mask; - for (unsigned i = 0; i != NumDstElts; i += NumSrcElts) - for (unsigned j = 0; j != NumSrcElts; ++j) - Mask.push_back(j); - - return CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], Mask, "subvecbcst"); -} - static Value *EmitX86Select(CodeGenFunction &CGF, Value *Mask, Value *Op0, Value *Op1) { @@ -7788,8 +8549,48 @@ static Value *EmitX86Select(CodeGenFunction &CGF, return CGF.Builder.CreateSelect(Mask, Op0, Op1); } +static Value *EmitX86ScalarSelect(CodeGenFunction &CGF, + Value *Mask, Value *Op0, Value *Op1) { + // If the mask is all ones just return first argument. 
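The new EmitX86ExpandLoad/EmitX86CompressStore helpers above lower the expandload/compressstore builtins to llvm.masked.expandload and llvm.masked.compressstore: memory is always accessed contiguously while the vector lanes are selected by the mask. A scalar sketch with illustrative element and mask types (assumes num_elts <= 32):

    #include <cstddef>
    #include <cstdint>

    // expandload: consecutive memory elements fill the selected lanes;
    // unselected lanes keep the pass-through value already in vec.
    void expand_load_sketch(std::int32_t *vec, const std::int32_t *mem,
                            std::uint32_t mask, std::size_t num_elts) {
      std::size_t m = 0;
      for (std::size_t i = 0; i != num_elts; ++i)
        if (mask & (1u << i))
          vec[i] = mem[m++];
    }

    // compressstore: selected lanes are packed and stored contiguously.
    void compress_store_sketch(std::int32_t *mem, const std::int32_t *vec,
                               std::uint32_t mask, std::size_t num_elts) {
      std::size_t m = 0;
      for (std::size_t i = 0; i != num_elts; ++i)
        if (mask & (1u << i))
          mem[m++] = vec[i];
    }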
+ if (const auto *C = dyn_cast<Constant>(Mask)) + if (C->isAllOnesValue()) + return Op0; + + llvm::VectorType *MaskTy = + llvm::VectorType::get(CGF.Builder.getInt1Ty(), + Mask->getType()->getIntegerBitWidth()); + Mask = CGF.Builder.CreateBitCast(Mask, MaskTy); + Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0); + return CGF.Builder.CreateSelect(Mask, Op0, Op1); +} + +static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp, + unsigned NumElts, Value *MaskIn) { + if (MaskIn) { + const auto *C = dyn_cast<Constant>(MaskIn); + if (!C || !C->isAllOnesValue()) + Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts)); + } + + if (NumElts < 8) { + uint32_t Indices[8]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + for (unsigned i = NumElts; i != 8; ++i) + Indices[i] = i % NumElts + NumElts; + Cmp = CGF.Builder.CreateShuffleVector( + Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices); + } + + return CGF.Builder.CreateBitCast(Cmp, + IntegerType::get(CGF.getLLVMContext(), + std::max(NumElts, 8U))); +} + static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, - bool Signed, SmallVectorImpl<Value *> &Ops) { + bool Signed, ArrayRef<Value *> Ops) { + assert((Ops.size() == 2 || Ops.size() == 4) && + "Unexpected number of arguments"); unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); Value *Cmp; @@ -7813,22 +8614,16 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC, Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]); } - const auto *C = dyn_cast<Constant>(Ops.back()); - if (!C || !C->isAllOnesValue()) - Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops.back(), NumElts)); + Value *MaskIn = nullptr; + if (Ops.size() == 4) + MaskIn = Ops[3]; - if (NumElts < 8) { - uint32_t Indices[8]; - for (unsigned i = 0; i != NumElts; ++i) - Indices[i] = i; - for (unsigned i = NumElts; i != 8; ++i) - Indices[i] = i % NumElts + NumElts; - Cmp = CGF.Builder.CreateShuffleVector( - Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices); - } - return CGF.Builder.CreateBitCast(Cmp, - IntegerType::get(CGF.getLLVMContext(), - std::max(NumElts, 8U))); + return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn); +} + +static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) { + Value *Zero = Constant::getNullValue(In->getType()); + return EmitX86MaskedCompare(CGF, 1, true, { In, Zero }); } static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) { @@ -7838,9 +8633,7 @@ static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) { Value *Sub = CGF.Builder.CreateSub(Zero, Ops[0]); Value *Cmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Zero); Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Sub); - if (Ops.size() == 1) - return Res; - return EmitX86Select(CGF, Ops[2], Res, Ops[1]); + return Res; } static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred, @@ -7848,14 +8641,214 @@ static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred, Value *Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]); Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Ops[1]); - if (Ops.size() == 2) - return Res; + assert(Ops.size() == 2); + return Res; +} + +// Lowers X86 FMA intrinsics to IR. 
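EmitX86ConvertToMask above reduces the cvt*2mask builtins to EmitX86MaskedCompare(CGF, 1, true, { In, Zero }); assuming CC == 1 maps to a signed less-than in that helper (the predicate table itself is outside this hunk), each mask bit ends up being the sign bit of the corresponding element. A scalar sketch of that result (element width and names are illustrative; assumes num_elts <= 32):

    #include <cstddef>
    #include <cstdint>

    // cvtd2mask-style conversion: bit i of the result is set iff element i < 0.
    std::uint32_t convert_to_mask_sketch(const std::int32_t *vec,
                                         std::size_t num_elts) {
      std::uint32_t mask = 0;
      for (std::size_t i = 0; i != num_elts; ++i)
        if (vec[i] < 0)
          mask |= 1u << i;
      return mask;
    }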
+static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops, + unsigned BuiltinID, bool IsAddSub) { + + bool Subtract = false; + Intrinsic::ID IID = Intrinsic::not_intrinsic; + switch (BuiltinID) { + default: break; + case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break; + case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break; + case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512; + break; + case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + Subtract = true; + LLVM_FALLTHROUGH; + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512; + break; + } + + Value *A = Ops[0]; + Value *B = Ops[1]; + Value *C = Ops[2]; + + if (Subtract) + C = CGF.Builder.CreateFNeg(C); + + Value *Res; + + // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding). + if (IID != Intrinsic::not_intrinsic && + cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4) { + Function *Intr = CGF.CGM.getIntrinsic(IID); + Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() }); + } else { + llvm::Type *Ty = A->getType(); + Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty); + Res = CGF.Builder.CreateCall(FMA, {A, B, C} ); + + if (IsAddSub) { + // Negate even elts in C using a mask. + unsigned NumElts = Ty->getVectorNumElements(); + SmallVector<uint32_t, 16> Indices(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i + (i % 2) * NumElts; + + Value *NegC = CGF.Builder.CreateFNeg(C); + Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} ); + Res = CGF.Builder.CreateShuffleVector(FMSub, Res, Indices); + } + } + + // Handle any required masking. 
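For the IsAddSub builtins, the code above computes both fma(A, B, C) and fma(A, B, -C) and blends them with Indices[i] = i + (i % 2) * NumElts, so even lanes take the subtract result and odd lanes the add result. A scalar sketch of that blend (lane count and element type are illustrative):

    #include <cmath>
    #include <cstddef>

    // vfmaddsub semantics: even lanes a*b - c, odd lanes a*b + c.
    void fmaddsub_sketch(float *dst, const float *a, const float *b,
                         const float *c, std::size_t num_elts) {
      for (std::size_t i = 0; i != num_elts; ++i)
        dst[i] = (i % 2 == 0) ? std::fma(a[i], b[i], -c[i])
                              : std::fma(a[i], b[i],  c[i]);
    }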
+ Value *MaskFalseVal = nullptr; + switch (BuiltinID) { + case clang::X86::BI__builtin_ia32_vfmaddps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask: + MaskFalseVal = Ops[0]; + break; + case clang::X86::BI__builtin_ia32_vfmaddps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + MaskFalseVal = Constant::getNullValue(Ops[0]->getType()); + break; + case clang::X86::BI__builtin_ia32_vfmsubps512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddps512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + MaskFalseVal = Ops[2]; + break; + } + + if (MaskFalseVal) + return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal); + + return Res; +} + +static Value * +EmitScalarFMAExpr(CodeGenFunction &CGF, MutableArrayRef<Value *> Ops, + Value *Upper, bool ZeroMask = false, unsigned PTIdx = 0, + bool NegAcc = false) { + unsigned Rnd = 4; + if (Ops.size() > 4) + Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); + + if (NegAcc) + Ops[2] = CGF.Builder.CreateFNeg(Ops[2]); + + Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0); + Value *Res; + if (Rnd != 4) { + Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ? + Intrinsic::x86_avx512_vfmadd_f32 : + Intrinsic::x86_avx512_vfmadd_f64; + Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), + {Ops[0], Ops[1], Ops[2], Ops[4]}); + } else { + Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType()); + Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3)); + } + // If we have more than 3 arguments, we need to do masking. + if (Ops.size() > 3) { + Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType()) + : Ops[PTIdx]; - assert(Ops.size() == 4); - return EmitX86Select(CGF, Ops[3], Res, Ops[2]); + // If we negated the accumulator and the its the PassThru value we need to + // bypass the negate. Conveniently Upper should be the same thing in this + // case. + if (NegAcc && PTIdx == 2) + PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0); + + Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru); + } + return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0); +} + +static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned, + ArrayRef<Value *> Ops) { + llvm::Type *Ty = Ops[0]->getType(); + // Arguments have a vXi32 type so cast to vXi64. + Ty = llvm::VectorType::get(CGF.Int64Ty, + Ty->getPrimitiveSizeInBits() / 64); + Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty); + Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty); + + if (IsSigned) { + // Shift left then arithmetic shift right. + Constant *ShiftAmt = ConstantInt::get(Ty, 32); + LHS = CGF.Builder.CreateShl(LHS, ShiftAmt); + LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt); + RHS = CGF.Builder.CreateShl(RHS, ShiftAmt); + RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt); + } else { + // Clear the upper bits. 
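EmitX86Muldq above (its unsigned half continues just below) implements the pmuldq/pmuludq-style builtins: each 64-bit lane multiplies only the low 32 bits of its operands, sign-extended via the shl/ashr pair for the signed form and zero-extended via the 0xffffffff mask otherwise. A per-lane sketch (function names are illustrative):

    #include <cstdint>

    // Signed form: low 32 bits of each operand, sign-extended, then multiplied.
    std::int64_t pmuldq_lane_sketch(std::int64_t a, std::int64_t b) {
      std::int64_t lo_a = static_cast<std::int32_t>(a & 0xffffffff);
      std::int64_t lo_b = static_cast<std::int32_t>(b & 0xffffffff);
      return lo_a * lo_b;
    }

    // Unsigned form: low 32 bits zero-extended, then multiplied.
    std::uint64_t pmuludq_lane_sketch(std::uint64_t a, std::uint64_t b) {
      return (a & 0xffffffff) * (b & 0xffffffff);
    }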
+ Constant *Mask = ConstantInt::get(Ty, 0xffffffff); + LHS = CGF.Builder.CreateAnd(LHS, Mask); + RHS = CGF.Builder.CreateAnd(RHS, Mask); + } + + return CGF.Builder.CreateMul(LHS, RHS); } -static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, +// Emit a masked pternlog intrinsic. This only exists because the header has to +// use a macro and we aren't able to pass the input argument to a pternlog +// builtin and a select builtin without evaluating it twice. +static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask, + ArrayRef<Value *> Ops) { + llvm::Type *Ty = Ops[0]->getType(); + + unsigned VecWidth = Ty->getPrimitiveSizeInBits(); + unsigned EltWidth = Ty->getScalarSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_128; + else if (VecWidth == 256 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_256; + else if (VecWidth == 512 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_512; + else if (VecWidth == 128 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_128; + else if (VecWidth == 256 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_256; + else if (VecWidth == 512 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), + Ops.drop_back()); + Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0]; + return EmitX86Select(CGF, Ops[4], Ternlog, PassThru); +} + +static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, llvm::Type *DstTy) { unsigned NumberOfElements = DstTy->getVectorNumElements(); Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); @@ -7918,11 +8911,10 @@ Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) { return EmitX86CpuSupports(FeatureStr); } -Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { +uint32_t +CodeGenFunction::GetX86CpuSupportsMask(ArrayRef<StringRef> FeatureStrs) { // Processor features and mapping to processor feature value. - uint32_t FeaturesMask = 0; - for (const StringRef &FeatureStr : FeatureStrs) { unsigned Feature = StringSwitch<unsigned>(FeatureStr) @@ -7931,7 +8923,14 @@ Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { ; FeaturesMask |= (1U << Feature); } + return FeaturesMask; +} +Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { + return EmitX86CpuSupports(GetX86CpuSupportsMask(FeatureStrs)); +} + +llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint32_t FeaturesMask) { // Matching the struct layout from the compiler-rt/libgcc structure that is // filled in: // unsigned int __cpu_vendor; @@ -7953,9 +8952,9 @@ Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) { Builder.CreateAlignedLoad(CpuFeatures, CharUnits::fromQuantity(4)); // Check the value of the bit corresponding to the feature requested. 
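EmitX86Ternlog above only selects the pternlog intrinsic for the vector and element width and applies the mask or zero-mask select; the bitwise truth-table behaviour of vpternlog itself is not spelled out in this hunk. Under that assumption, a single-lane sketch of what the instruction computes (names are hypothetical):

    #include <cstdint>

    // For each bit position, the three source bits form a 3-bit index into the
    // 8-bit immediate; the selected immediate bit becomes the result bit.
    std::uint32_t ternlog_lane_sketch(std::uint32_t a, std::uint32_t b,
                                      std::uint32_t c, std::uint8_t imm) {
      std::uint32_t result = 0;
      for (unsigned bit = 0; bit != 32; ++bit) {
        unsigned idx = (((a >> bit) & 1u) << 2) | (((b >> bit) & 1u) << 1) |
                       ((c >> bit) & 1u);
        result |= ((imm >> idx) & 1u) << bit;
      }
      return result;
    }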
- Value *Bitset = Builder.CreateAnd( - Features, llvm::ConstantInt::get(Int32Ty, FeaturesMask)); - return Builder.CreateICmpNE(Bitset, llvm::ConstantInt::get(Int32Ty, 0)); + Value *Mask = Builder.getInt32(FeaturesMask); + Value *Bitset = Builder.CreateAnd(Features, Mask); + return Builder.CreateICmpEQ(Bitset, Mask); } Value *CodeGenFunction::EmitX86CpuInit() { @@ -8067,8 +9066,37 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateBitCast(BuildVector(Ops), llvm::Type::getX86_MMXTy(getLLVMContext())); case X86::BI__builtin_ia32_vec_ext_v2si: - return Builder.CreateExtractElement(Ops[0], - llvm::ConstantInt::get(Ops[1]->getType(), 0)); + case X86::BI__builtin_ia32_vec_ext_v16qi: + case X86::BI__builtin_ia32_vec_ext_v8hi: + case X86::BI__builtin_ia32_vec_ext_v4si: + case X86::BI__builtin_ia32_vec_ext_v4sf: + case X86::BI__builtin_ia32_vec_ext_v2di: + case X86::BI__builtin_ia32_vec_ext_v32qi: + case X86::BI__builtin_ia32_vec_ext_v16hi: + case X86::BI__builtin_ia32_vec_ext_v8si: + case X86::BI__builtin_ia32_vec_ext_v4di: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue(); + Index &= NumElts - 1; + // These builtins exist so we can ensure the index is an ICE and in range. + // Otherwise we could just do this in the header file. + return Builder.CreateExtractElement(Ops[0], Index); + } + case X86::BI__builtin_ia32_vec_set_v16qi: + case X86::BI__builtin_ia32_vec_set_v8hi: + case X86::BI__builtin_ia32_vec_set_v4si: + case X86::BI__builtin_ia32_vec_set_v2di: + case X86::BI__builtin_ia32_vec_set_v32qi: + case X86::BI__builtin_ia32_vec_set_v16hi: + case X86::BI__builtin_ia32_vec_set_v8si: + case X86::BI__builtin_ia32_vec_set_v4di: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); + Index &= NumElts - 1; + // These builtins exist so we can ensure the index is an ICE and in range. + // Otherwise we could just do this in the header file. 
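The __builtin_cpu_supports change above is a semantic tightening: the old lowering returned true when any requested feature bit was present, while the new CreateICmpEQ against the full mask requires all of them. A minimal sketch of the difference (function names are illustrative):

    #include <cstdint>

    // Old lowering: (features & mask) != 0 -- true if any bit matches.
    bool cpu_supports_old_sketch(std::uint32_t features, std::uint32_t mask) {
      return (features & mask) != 0;
    }

    // New lowering: (features & mask) == mask -- every requested bit must be set.
    bool cpu_supports_new_sketch(std::uint32_t features, std::uint32_t mask) {
      return (features & mask) == mask;
    }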
+ return Builder.CreateInsertElement(Ops[0], Ops[1], Index); + } case X86::BI_mm_setcsr: case X86::BI__builtin_ia32_ldmxcsr: { Address Tmp = CreateMemTemp(E->getArg(0)->getType()); @@ -8145,7 +9173,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: { - return EmitX86MaskedStore(*this, Ops, 16); + return EmitX86MaskedStore(*this, Ops, 1); } case X86::BI__builtin_ia32_vpopcntb_128: case X86::BI__builtin_ia32_vpopcntd_128: @@ -8177,6 +9205,66 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cvtmask2q512: return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType())); + case X86::BI__builtin_ia32_cvtb2mask128: + case X86::BI__builtin_ia32_cvtb2mask256: + case X86::BI__builtin_ia32_cvtb2mask512: + case X86::BI__builtin_ia32_cvtw2mask128: + case X86::BI__builtin_ia32_cvtw2mask256: + case X86::BI__builtin_ia32_cvtw2mask512: + case X86::BI__builtin_ia32_cvtd2mask128: + case X86::BI__builtin_ia32_cvtd2mask256: + case X86::BI__builtin_ia32_cvtd2mask512: + case X86::BI__builtin_ia32_cvtq2mask128: + case X86::BI__builtin_ia32_cvtq2mask256: + case X86::BI__builtin_ia32_cvtq2mask512: + return EmitX86ConvertToMask(*this, Ops[0]); + + case X86::BI__builtin_ia32_vfmaddss3: + case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddss3_mask: + case X86::BI__builtin_ia32_vfmaddsd3_mask: + return EmitScalarFMAExpr(*this, Ops, Ops[0]); + case X86::BI__builtin_ia32_vfmaddss: + case X86::BI__builtin_ia32_vfmaddsd: + return EmitScalarFMAExpr(*this, Ops, + Constant::getNullValue(Ops[0]->getType())); + case X86::BI__builtin_ia32_vfmaddss3_maskz: + case X86::BI__builtin_ia32_vfmaddsd3_maskz: + return EmitScalarFMAExpr(*this, Ops, Ops[0], /*ZeroMask*/true); + case X86::BI__builtin_ia32_vfmaddss3_mask3: + case X86::BI__builtin_ia32_vfmaddsd3_mask3: + return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2); + case X86::BI__builtin_ia32_vfmsubss3_mask3: + case X86::BI__builtin_ia32_vfmsubsd3_mask3: + return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2, + /*NegAcc*/true); + case X86::BI__builtin_ia32_vfmaddps: + case X86::BI__builtin_ia32_vfmaddpd: + case X86::BI__builtin_ia32_vfmaddps256: + case X86::BI__builtin_ia32_vfmaddpd256: + case X86::BI__builtin_ia32_vfmaddps512_mask: + case X86::BI__builtin_ia32_vfmaddps512_maskz: + case X86::BI__builtin_ia32_vfmaddps512_mask3: + case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddpd512_mask: + case X86::BI__builtin_ia32_vfmaddpd512_maskz: + case X86::BI__builtin_ia32_vfmaddpd512_mask3: + case X86::BI__builtin_ia32_vfmsubpd512_mask3: + return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false); + case X86::BI__builtin_ia32_vfmaddsubps: + case X86::BI__builtin_ia32_vfmaddsubpd: + case X86::BI__builtin_ia32_vfmaddsubps256: + case X86::BI__builtin_ia32_vfmaddsubpd256: + case X86::BI__builtin_ia32_vfmaddsubps512_mask: + case X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/true); + case X86::BI__builtin_ia32_movdqa32store128_mask: case X86::BI__builtin_ia32_movdqa64store128_mask: case 
X86::BI__builtin_ia32_storeaps128_mask: @@ -8215,7 +9303,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: - return EmitX86MaskedLoad(*this, Ops, 16); + return EmitX86MaskedLoad(*this, Ops, 1); case X86::BI__builtin_ia32_loadaps128_mask: case X86::BI__builtin_ia32_loadaps256_mask: @@ -8234,11 +9322,45 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86MaskedLoad(*this, Ops, Align); } - case X86::BI__builtin_ia32_vbroadcastf128_pd256: - case X86::BI__builtin_ia32_vbroadcastf128_ps256: { - llvm::Type *DstTy = ConvertType(E->getType()); - return EmitX86SubVectorBroadcast(*this, Ops, DstTy, 128, 1); - } + case X86::BI__builtin_ia32_expandloaddf128_mask: + case X86::BI__builtin_ia32_expandloaddf256_mask: + case X86::BI__builtin_ia32_expandloaddf512_mask: + case X86::BI__builtin_ia32_expandloadsf128_mask: + case X86::BI__builtin_ia32_expandloadsf256_mask: + case X86::BI__builtin_ia32_expandloadsf512_mask: + case X86::BI__builtin_ia32_expandloaddi128_mask: + case X86::BI__builtin_ia32_expandloaddi256_mask: + case X86::BI__builtin_ia32_expandloaddi512_mask: + case X86::BI__builtin_ia32_expandloadsi128_mask: + case X86::BI__builtin_ia32_expandloadsi256_mask: + case X86::BI__builtin_ia32_expandloadsi512_mask: + case X86::BI__builtin_ia32_expandloadhi128_mask: + case X86::BI__builtin_ia32_expandloadhi256_mask: + case X86::BI__builtin_ia32_expandloadhi512_mask: + case X86::BI__builtin_ia32_expandloadqi128_mask: + case X86::BI__builtin_ia32_expandloadqi256_mask: + case X86::BI__builtin_ia32_expandloadqi512_mask: + return EmitX86ExpandLoad(*this, Ops); + + case X86::BI__builtin_ia32_compressstoredf128_mask: + case X86::BI__builtin_ia32_compressstoredf256_mask: + case X86::BI__builtin_ia32_compressstoredf512_mask: + case X86::BI__builtin_ia32_compressstoresf128_mask: + case X86::BI__builtin_ia32_compressstoresf256_mask: + case X86::BI__builtin_ia32_compressstoresf512_mask: + case X86::BI__builtin_ia32_compressstoredi128_mask: + case X86::BI__builtin_ia32_compressstoredi256_mask: + case X86::BI__builtin_ia32_compressstoredi512_mask: + case X86::BI__builtin_ia32_compressstoresi128_mask: + case X86::BI__builtin_ia32_compressstoresi256_mask: + case X86::BI__builtin_ia32_compressstoresi512_mask: + case X86::BI__builtin_ia32_compressstorehi128_mask: + case X86::BI__builtin_ia32_compressstorehi256_mask: + case X86::BI__builtin_ia32_compressstorehi512_mask: + case X86::BI__builtin_ia32_compressstoreqi128_mask: + case X86::BI__builtin_ia32_compressstoreqi256_mask: + case X86::BI__builtin_ia32_compressstoreqi512_mask: + return EmitX86CompressStore(*this, Ops); case X86::BI__builtin_ia32_storehps: case X86::BI__builtin_ia32_storelps: { @@ -8250,17 +9372,275 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // extract (0, 1) unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 
0 : 1; - llvm::Value *Idx = llvm::ConstantInt::get(SizeTy, Index); - Ops[1] = Builder.CreateExtractElement(Ops[1], Idx, "extract"); + Ops[1] = Builder.CreateExtractElement(Ops[1], Index, "extract"); // cast pointer to i64 & store Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy); return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_extractf64x4_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: { + llvm::Type *DstTy = ConvertType(E->getType()); + unsigned NumElts = DstTy->getVectorNumElements(); + unsigned SrcNumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned SubVectors = SrcNumElts / NumElts; + unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue(); + assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); + Index &= SubVectors - 1; // Remove any extra bits. + Index *= NumElts; + + uint32_t Indices[16]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i + Index; + + Value *Res = Builder.CreateShuffleVector(Ops[0], + UndefValue::get(Ops[0]->getType()), + makeArrayRef(Indices, NumElts), + "extract"); + + if (Ops.size() == 4) + Res = EmitX86Select(*this, Ops[3], Res, Ops[2]); + + return Res; + } + case X86::BI__builtin_ia32_vinsertf128_pd256: + case X86::BI__builtin_ia32_vinsertf128_ps256: + case X86::BI__builtin_ia32_vinsertf128_si256: + case X86::BI__builtin_ia32_insert128i256: + case X86::BI__builtin_ia32_insertf64x4: + case X86::BI__builtin_ia32_insertf32x4: + case X86::BI__builtin_ia32_inserti64x4: + case X86::BI__builtin_ia32_inserti32x4: + case X86::BI__builtin_ia32_insertf32x8: + case X86::BI__builtin_ia32_inserti32x8: + case X86::BI__builtin_ia32_insertf32x4_256: + case X86::BI__builtin_ia32_inserti32x4_256: + case X86::BI__builtin_ia32_insertf64x2_256: + case X86::BI__builtin_ia32_inserti64x2_256: + case X86::BI__builtin_ia32_insertf64x2_512: + case X86::BI__builtin_ia32_inserti64x2_512: { + unsigned DstNumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned SrcNumElts = Ops[1]->getType()->getVectorNumElements(); + unsigned SubVectors = DstNumElts / SrcNumElts; + unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue(); + assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); + Index &= SubVectors - 1; // Remove any extra bits. + Index *= SrcNumElts; + + uint32_t Indices[16]; + for (unsigned i = 0; i != DstNumElts; ++i) + Indices[i] = (i >= SrcNumElts) ? 
SrcNumElts + (i % SrcNumElts) : i; + + Value *Op1 = Builder.CreateShuffleVector(Ops[1], + UndefValue::get(Ops[1]->getType()), + makeArrayRef(Indices, DstNumElts), + "widen"); + + for (unsigned i = 0; i != DstNumElts; ++i) { + if (i >= Index && i < (Index + SrcNumElts)) + Indices[i] = (i - Index) + DstNumElts; + else + Indices[i] = i; + } + + return Builder.CreateShuffleVector(Ops[0], Op1, + makeArrayRef(Indices, DstNumElts), + "insert"); + } + case X86::BI__builtin_ia32_pmovqd512_mask: + case X86::BI__builtin_ia32_pmovwb512_mask: { + Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType()); + return EmitX86Select(*this, Ops[2], Res, Ops[1]); + } + case X86::BI__builtin_ia32_pmovdb512_mask: + case X86::BI__builtin_ia32_pmovdw512_mask: + case X86::BI__builtin_ia32_pmovqw512_mask: { + if (const auto *C = dyn_cast<Constant>(Ops[2])) + if (C->isAllOnesValue()) + return Builder.CreateTrunc(Ops[0], Ops[1]->getType()); + + Intrinsic::ID IID; + switch (BuiltinID) { + default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_pmovdb512_mask: + IID = Intrinsic::x86_avx512_mask_pmov_db_512; + break; + case X86::BI__builtin_ia32_pmovdw512_mask: + IID = Intrinsic::x86_avx512_mask_pmov_dw_512; + break; + case X86::BI__builtin_ia32_pmovqw512_mask: + IID = Intrinsic::x86_avx512_mask_pmov_qw_512; + break; + } + + Function *Intr = CGM.getIntrinsic(IID); + return Builder.CreateCall(Intr, Ops); + } + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_blendpd: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendpd256: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_pblendd256: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + + uint32_t Indices[16]; + // If there are more than 8 elements, the immediate is used twice so make + // sure we handle that. + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i; + + return Builder.CreateShuffleVector(Ops[0], Ops[1], + makeArrayRef(Indices, NumElts), + "blend"); + } + case X86::BI__builtin_ia32_pshuflw: + case X86::BI__builtin_ia32_pshuflw256: + case X86::BI__builtin_ia32_pshuflw512: { + uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + Imm = (Imm & 0xff) * 0x01010101; + + uint32_t Indices[32]; + for (unsigned l = 0; l != NumElts; l += 8) { + for (unsigned i = 0; i != 4; ++i) { + Indices[l + i] = l + (Imm & 3); + Imm >>= 2; + } + for (unsigned i = 4; i != 8; ++i) + Indices[l + i] = l + i; + } + + return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty), + makeArrayRef(Indices, NumElts), + "pshuflw"); + } + case X86::BI__builtin_ia32_pshufhw: + case X86::BI__builtin_ia32_pshufhw256: + case X86::BI__builtin_ia32_pshufhw512: { + uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
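The blend cases above turn the immediate directly into shuffle indices, Indices[i] = ((Imm >> (i % 8)) & 1) ? NumElts + i : i, so bit (i % 8) of the immediate picks the second source for lane i and the eight immediate bits repeat for wider vectors. A scalar sketch (element type and names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // pblendw/blendps-style select: immediate bit (i % 8) chooses b[i] over a[i].
    void blend_sketch(std::int32_t *dst, const std::int32_t *a,
                      const std::int32_t *b, std::uint8_t imm,
                      std::size_t num_elts) {
      for (std::size_t i = 0; i != num_elts; ++i)
        dst[i] = ((imm >> (i % 8)) & 1) ? b[i] : a[i];
    }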
+ Imm = (Imm & 0xff) * 0x01010101; + + uint32_t Indices[32]; + for (unsigned l = 0; l != NumElts; l += 8) { + for (unsigned i = 0; i != 4; ++i) + Indices[l + i] = l + i; + for (unsigned i = 4; i != 8; ++i) { + Indices[l + i] = l + 4 + (Imm & 3); + Imm >>= 2; + } + } + + return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty), + makeArrayRef(Indices, NumElts), + "pshufhw"); + } + case X86::BI__builtin_ia32_pshufd: + case X86::BI__builtin_ia32_pshufd256: + case X86::BI__builtin_ia32_pshufd512: + case X86::BI__builtin_ia32_vpermilpd: + case X86::BI__builtin_ia32_vpermilps: + case X86::BI__builtin_ia32_vpermilpd256: + case X86::BI__builtin_ia32_vpermilps256: + case X86::BI__builtin_ia32_vpermilpd512: + case X86::BI__builtin_ia32_vpermilps512: { + uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + Imm = (Imm & 0xff) * 0x01010101; + + uint32_t Indices[16]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + Indices[i + l] = (Imm % NumLaneElts) + l; + Imm /= NumLaneElts; + } + } + + return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty), + makeArrayRef(Indices, NumElts), + "permil"); + } + case X86::BI__builtin_ia32_shufpd: + case X86::BI__builtin_ia32_shufpd256: + case X86::BI__builtin_ia32_shufpd512: + case X86::BI__builtin_ia32_shufps: + case X86::BI__builtin_ia32_shufps256: + case X86::BI__builtin_ia32_shufps512: { + uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + Imm = (Imm & 0xff) * 0x01010101; + + uint32_t Indices[16]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Index = Imm % NumLaneElts; + Imm /= NumLaneElts; + if (i >= (NumLaneElts / 2)) + Index += NumElts; + Indices[l + i] = l + Index; + } + } + + return Builder.CreateShuffleVector(Ops[0], Ops[1], + makeArrayRef(Indices, NumElts), + "shufp"); + } + case X86::BI__builtin_ia32_permdi256: + case X86::BI__builtin_ia32_permdf256: + case X86::BI__builtin_ia32_permdi512: + case X86::BI__builtin_ia32_permdf512: { + unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + + // These intrinsics operate on 256-bit lanes of four 64-bit elements. 
+ uint32_t Indices[8]; + for (unsigned l = 0; l != NumElts; l += 4) + for (unsigned i = 0; i != 4; ++i) + Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3); + + return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty), + makeArrayRef(Indices, NumElts), + "perm"); + } case X86::BI__builtin_ia32_palignr128: case X86::BI__builtin_ia32_palignr256: - case X86::BI__builtin_ia32_palignr512_mask: { - unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + case X86::BI__builtin_ia32_palignr512: { + unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); assert(NumElts % 16 == 0); @@ -8289,15 +9669,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } } - Value *Align = Builder.CreateShuffleVector(Ops[1], Ops[0], - makeArrayRef(Indices, NumElts), - "palignr"); + return Builder.CreateShuffleVector(Ops[1], Ops[0], + makeArrayRef(Indices, NumElts), + "palignr"); + } + case X86::BI__builtin_ia32_alignd128: + case X86::BI__builtin_ia32_alignd256: + case X86::BI__builtin_ia32_alignd512: + case X86::BI__builtin_ia32_alignq128: + case X86::BI__builtin_ia32_alignq256: + case X86::BI__builtin_ia32_alignq512: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff; + + // Mask the shift amount to width of two vectors. + ShiftVal &= (2 * NumElts) - 1; - // If this isn't a masked builtin, just return the align operation. - if (Ops.size() == 3) - return Align; + uint32_t Indices[16]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i + ShiftVal; - return EmitX86Select(*this, Ops[4], Align, Ops[3]); + return Builder.CreateShuffleVector(Ops[1], Ops[0], + makeArrayRef(Indices, NumElts), + "valign"); + } + case X86::BI__builtin_ia32_shuf_f32x4_256: + case X86::BI__builtin_ia32_shuf_f64x2_256: + case X86::BI__builtin_ia32_shuf_i32x4_256: + case X86::BI__builtin_ia32_shuf_i64x2_256: + case X86::BI__builtin_ia32_shuf_f32x4: + case X86::BI__builtin_ia32_shuf_f64x2: + case X86::BI__builtin_ia32_shuf_i32x4: + case X86::BI__builtin_ia32_shuf_i64x2: { + unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); + llvm::Type *Ty = Ops[0]->getType(); + unsigned NumElts = Ty->getVectorNumElements(); + unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; + unsigned NumLaneElts = NumElts / NumLanes; + + uint32_t Indices[16]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + unsigned Index = (Imm % NumLanes) * NumLaneElts; + Imm /= NumLanes; // Discard the bits we just used. + if (l >= (NumElts / 2)) + Index += NumElts; // Switch to other source. + for (unsigned i = 0; i != NumLaneElts; ++i) { + Indices[l + i] = Index + i; + } + } + + return Builder.CreateShuffleVector(Ops[0], Ops[1], + makeArrayRef(Indices, NumElts), + "shuf"); } case X86::BI__builtin_ia32_vperm2f128_pd256: @@ -8339,6 +9762,66 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, "vperm"); } + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: { + unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; + llvm::Type *ResultType = Ops[0]->getType(); + // Builtin type is vXi64 so multiply by 8 to get bytes. + unsigned NumElts = ResultType->getVectorNumElements() * 8; + + // If pslldq is shifting the vector more than 15 bytes, emit zero. 
+ if (ShiftVal >= 16) + return llvm::Constant::getNullValue(ResultType); + + uint32_t Indices[64]; + // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that + for (unsigned l = 0; l != NumElts; l += 16) { + for (unsigned i = 0; i != 16; ++i) { + unsigned Idx = NumElts + i - ShiftVal; + if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. + Indices[l + i] = Idx + l; + } + } + + llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts); + Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); + Value *Zero = llvm::Constant::getNullValue(VecTy); + Value *SV = Builder.CreateShuffleVector(Zero, Cast, + makeArrayRef(Indices, NumElts), + "pslldq"); + return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); + } + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: { + unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; + llvm::Type *ResultType = Ops[0]->getType(); + // Builtin type is vXi64 so multiply by 8 to get bytes. + unsigned NumElts = ResultType->getVectorNumElements() * 8; + + // If psrldq is shifting the vector more than 15 bytes, emit zero. + if (ShiftVal >= 16) + return llvm::Constant::getNullValue(ResultType); + + uint32_t Indices[64]; + // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that + for (unsigned l = 0; l != NumElts; l += 16) { + for (unsigned i = 0; i != 16; ++i) { + unsigned Idx = i + ShiftVal; + if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. + Indices[l + i] = Idx + l; + } + } + + llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts); + Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); + Value *Zero = llvm::Constant::getNullValue(VecTy); + Value *SV = Builder.CreateShuffleVector(Cast, Zero, + makeArrayRef(Indices, NumElts), + "psrldq"); + return Builder.CreateBitCast(SV, ResultType, "cast"); + } case X86::BI__builtin_ia32_movnti: case X86::BI__builtin_ia32_movnti64: case X86::BI__builtin_ia32_movntsd: @@ -8384,6 +9867,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_selectpd_256: case X86::BI__builtin_ia32_selectpd_512: return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); + case X86::BI__builtin_ia32_selectss_128: + case X86::BI__builtin_ia32_selectsd_128: { + Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0); + A = EmitX86ScalarSelect(*this, Ops[0], A, B); + return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0); + } case X86::BI__builtin_ia32_cmpb128_mask: case X86::BI__builtin_ia32_cmpb256_mask: case X86::BI__builtin_ia32_cmpb512_mask: @@ -8415,6 +9905,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86MaskedCompare(*this, CC, false, Ops); } + case X86::BI__builtin_ia32_kortestchi: + case X86::BI__builtin_ia32_kortestzhi: { + Value *Or = EmitX86MaskLogic(*this, Instruction::Or, 16, Ops); + Value *C; + if (BuiltinID == X86::BI__builtin_ia32_kortestchi) + C = llvm::Constant::getAllOnesValue(Builder.getInt16Ty()); + else + C = llvm::Constant::getNullValue(Builder.getInt16Ty()); + Value *Cmp = Builder.CreateICmpEQ(Or, C); + return Builder.CreateZExt(Cmp, ConvertType(E->getType())); + } + case X86::BI__builtin_ia32_kandhi: return EmitX86MaskLogic(*this, Instruction::And, 16, Ops); case X86::BI__builtin_ia32_kandnhi: @@ -8431,85 +9933,176 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned 
BuiltinID, Builder.getInt16Ty()); } - case X86::BI__builtin_ia32_vplzcntd_128_mask: - case X86::BI__builtin_ia32_vplzcntd_256_mask: - case X86::BI__builtin_ia32_vplzcntd_512_mask: - case X86::BI__builtin_ia32_vplzcntq_128_mask: - case X86::BI__builtin_ia32_vplzcntq_256_mask: - case X86::BI__builtin_ia32_vplzcntq_512_mask: { + case X86::BI__builtin_ia32_kunpckdi: + case X86::BI__builtin_ia32_kunpcksi: + case X86::BI__builtin_ia32_kunpckhi: { + unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits(); + Value *LHS = getMaskVecValue(*this, Ops[0], NumElts); + Value *RHS = getMaskVecValue(*this, Ops[1], NumElts); + uint32_t Indices[64]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + + // First extract half of each vector. This gives better codegen than + // doing it in a single shuffle. + LHS = Builder.CreateShuffleVector(LHS, LHS, + makeArrayRef(Indices, NumElts / 2)); + RHS = Builder.CreateShuffleVector(RHS, RHS, + makeArrayRef(Indices, NumElts / 2)); + // Concat the vectors. + // NOTE: Operands are swapped to match the intrinsic definition. + Value *Res = Builder.CreateShuffleVector(RHS, LHS, + makeArrayRef(Indices, NumElts)); + return Builder.CreateBitCast(Res, Ops[0]->getType()); + } + + case X86::BI__builtin_ia32_vplzcntd_128: + case X86::BI__builtin_ia32_vplzcntd_256: + case X86::BI__builtin_ia32_vplzcntd_512: + case X86::BI__builtin_ia32_vplzcntq_128: + case X86::BI__builtin_ia32_vplzcntq_256: + case X86::BI__builtin_ia32_vplzcntq_512: { Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); - return EmitX86Select(*this, Ops[2], - Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}), - Ops[1]); + return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}); + } + case X86::BI__builtin_ia32_sqrtss: + case X86::BI__builtin_ia32_sqrtsd: { + Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); + A = Builder.CreateCall(F, {A}); + return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); + } + case X86::BI__builtin_ia32_sqrtsd_round_mask: + case X86::BI__builtin_ia32_sqrtss_round_mask: { + unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); + // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), + // otherwise keep the intrinsic. + if (CC != 4) { + Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ? + Intrinsic::x86_avx512_mask_sqrt_sd : + Intrinsic::x86_avx512_mask_sqrt_ss; + return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); + } + Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType()); + A = Builder.CreateCall(F, A); + Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0); + A = EmitX86ScalarSelect(*this, Ops[3], A, Src); + return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); + } + case X86::BI__builtin_ia32_sqrtpd256: + case X86::BI__builtin_ia32_sqrtpd: + case X86::BI__builtin_ia32_sqrtps256: + case X86::BI__builtin_ia32_sqrtps: + case X86::BI__builtin_ia32_sqrtps512: + case X86::BI__builtin_ia32_sqrtpd512: { + if (Ops.size() == 2) { + unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue(); + // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), + // otherwise keep the intrinsic. + if (CC != 4) { + Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ? 
+ Intrinsic::x86_avx512_sqrt_ps_512 : + Intrinsic::x86_avx512_sqrt_pd_512; + return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); + } + } + Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType()); + return Builder.CreateCall(F, Ops[0]); } - case X86::BI__builtin_ia32_pabsb128: case X86::BI__builtin_ia32_pabsw128: case X86::BI__builtin_ia32_pabsd128: case X86::BI__builtin_ia32_pabsb256: case X86::BI__builtin_ia32_pabsw256: case X86::BI__builtin_ia32_pabsd256: - case X86::BI__builtin_ia32_pabsq128_mask: - case X86::BI__builtin_ia32_pabsq256_mask: - case X86::BI__builtin_ia32_pabsb512_mask: - case X86::BI__builtin_ia32_pabsw512_mask: - case X86::BI__builtin_ia32_pabsd512_mask: - case X86::BI__builtin_ia32_pabsq512_mask: + case X86::BI__builtin_ia32_pabsq128: + case X86::BI__builtin_ia32_pabsq256: + case X86::BI__builtin_ia32_pabsb512: + case X86::BI__builtin_ia32_pabsw512: + case X86::BI__builtin_ia32_pabsd512: + case X86::BI__builtin_ia32_pabsq512: return EmitX86Abs(*this, Ops); case X86::BI__builtin_ia32_pmaxsb128: case X86::BI__builtin_ia32_pmaxsw128: case X86::BI__builtin_ia32_pmaxsd128: - case X86::BI__builtin_ia32_pmaxsq128_mask: + case X86::BI__builtin_ia32_pmaxsq128: case X86::BI__builtin_ia32_pmaxsb256: case X86::BI__builtin_ia32_pmaxsw256: case X86::BI__builtin_ia32_pmaxsd256: - case X86::BI__builtin_ia32_pmaxsq256_mask: - case X86::BI__builtin_ia32_pmaxsb512_mask: - case X86::BI__builtin_ia32_pmaxsw512_mask: - case X86::BI__builtin_ia32_pmaxsd512_mask: - case X86::BI__builtin_ia32_pmaxsq512_mask: + case X86::BI__builtin_ia32_pmaxsq256: + case X86::BI__builtin_ia32_pmaxsb512: + case X86::BI__builtin_ia32_pmaxsw512: + case X86::BI__builtin_ia32_pmaxsd512: + case X86::BI__builtin_ia32_pmaxsq512: return EmitX86MinMax(*this, ICmpInst::ICMP_SGT, Ops); case X86::BI__builtin_ia32_pmaxub128: case X86::BI__builtin_ia32_pmaxuw128: case X86::BI__builtin_ia32_pmaxud128: - case X86::BI__builtin_ia32_pmaxuq128_mask: + case X86::BI__builtin_ia32_pmaxuq128: case X86::BI__builtin_ia32_pmaxub256: case X86::BI__builtin_ia32_pmaxuw256: case X86::BI__builtin_ia32_pmaxud256: - case X86::BI__builtin_ia32_pmaxuq256_mask: - case X86::BI__builtin_ia32_pmaxub512_mask: - case X86::BI__builtin_ia32_pmaxuw512_mask: - case X86::BI__builtin_ia32_pmaxud512_mask: - case X86::BI__builtin_ia32_pmaxuq512_mask: + case X86::BI__builtin_ia32_pmaxuq256: + case X86::BI__builtin_ia32_pmaxub512: + case X86::BI__builtin_ia32_pmaxuw512: + case X86::BI__builtin_ia32_pmaxud512: + case X86::BI__builtin_ia32_pmaxuq512: return EmitX86MinMax(*this, ICmpInst::ICMP_UGT, Ops); case X86::BI__builtin_ia32_pminsb128: case X86::BI__builtin_ia32_pminsw128: case X86::BI__builtin_ia32_pminsd128: - case X86::BI__builtin_ia32_pminsq128_mask: + case X86::BI__builtin_ia32_pminsq128: case X86::BI__builtin_ia32_pminsb256: case X86::BI__builtin_ia32_pminsw256: case X86::BI__builtin_ia32_pminsd256: - case X86::BI__builtin_ia32_pminsq256_mask: - case X86::BI__builtin_ia32_pminsb512_mask: - case X86::BI__builtin_ia32_pminsw512_mask: - case X86::BI__builtin_ia32_pminsd512_mask: - case X86::BI__builtin_ia32_pminsq512_mask: + case X86::BI__builtin_ia32_pminsq256: + case X86::BI__builtin_ia32_pminsb512: + case X86::BI__builtin_ia32_pminsw512: + case X86::BI__builtin_ia32_pminsd512: + case X86::BI__builtin_ia32_pminsq512: return EmitX86MinMax(*this, ICmpInst::ICMP_SLT, Ops); case X86::BI__builtin_ia32_pminub128: case X86::BI__builtin_ia32_pminuw128: case X86::BI__builtin_ia32_pminud128: - case X86::BI__builtin_ia32_pminuq128_mask: + case 
X86::BI__builtin_ia32_pminuq128: case X86::BI__builtin_ia32_pminub256: case X86::BI__builtin_ia32_pminuw256: case X86::BI__builtin_ia32_pminud256: - case X86::BI__builtin_ia32_pminuq256_mask: - case X86::BI__builtin_ia32_pminub512_mask: - case X86::BI__builtin_ia32_pminuw512_mask: - case X86::BI__builtin_ia32_pminud512_mask: - case X86::BI__builtin_ia32_pminuq512_mask: + case X86::BI__builtin_ia32_pminuq256: + case X86::BI__builtin_ia32_pminub512: + case X86::BI__builtin_ia32_pminuw512: + case X86::BI__builtin_ia32_pminud512: + case X86::BI__builtin_ia32_pminuq512: return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops); + case X86::BI__builtin_ia32_pmuludq128: + case X86::BI__builtin_ia32_pmuludq256: + case X86::BI__builtin_ia32_pmuludq512: + return EmitX86Muldq(*this, /*IsSigned*/false, Ops); + + case X86::BI__builtin_ia32_pmuldq128: + case X86::BI__builtin_ia32_pmuldq256: + case X86::BI__builtin_ia32_pmuldq512: + return EmitX86Muldq(*this, /*IsSigned*/true, Ops); + + case X86::BI__builtin_ia32_pternlogd512_mask: + case X86::BI__builtin_ia32_pternlogq512_mask: + case X86::BI__builtin_ia32_pternlogd128_mask: + case X86::BI__builtin_ia32_pternlogd256_mask: + case X86::BI__builtin_ia32_pternlogq128_mask: + case X86::BI__builtin_ia32_pternlogq256_mask: + return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops); + + case X86::BI__builtin_ia32_pternlogd512_maskz: + case X86::BI__builtin_ia32_pternlogq512_maskz: + case X86::BI__builtin_ia32_pternlogd128_maskz: + case X86::BI__builtin_ia32_pternlogd256_maskz: + case X86::BI__builtin_ia32_pternlogq128_maskz: + case X86::BI__builtin_ia32_pternlogq256_maskz: + return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops); + // 3DNow! case X86::BI__builtin_ia32_pswapdsf: case X86::BI__builtin_ia32_pswapdsi: { @@ -8553,7 +10146,44 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateExtractValue(Call, 1); } - // SSE packed comparison intrinsics + case X86::BI__builtin_ia32_fpclassps128_mask: + case X86::BI__builtin_ia32_fpclassps256_mask: + case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_fpclasspd128_mask: + case X86::BI__builtin_ia32_fpclasspd256_mask: + case X86::BI__builtin_ia32_fpclasspd512_mask: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + Value *MaskIn = Ops[2]; + Ops.erase(&Ops[2]); + + Intrinsic::ID ID; + switch (BuiltinID) { + default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_fpclassps128_mask: + ID = Intrinsic::x86_avx512_fpclass_ps_128; + break; + case X86::BI__builtin_ia32_fpclassps256_mask: + ID = Intrinsic::x86_avx512_fpclass_ps_256; + break; + case X86::BI__builtin_ia32_fpclassps512_mask: + ID = Intrinsic::x86_avx512_fpclass_ps_512; + break; + case X86::BI__builtin_ia32_fpclasspd128_mask: + ID = Intrinsic::x86_avx512_fpclass_pd_128; + break; + case X86::BI__builtin_ia32_fpclasspd256_mask: + ID = Intrinsic::x86_avx512_fpclass_pd_256; + break; + case X86::BI__builtin_ia32_fpclasspd512_mask: + ID = Intrinsic::x86_avx512_fpclass_pd_512; + break; + } + + Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops); + return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn); + } + + // packed comparison intrinsics case X86::BI__builtin_ia32_cmpeqps: case X86::BI__builtin_ia32_cmpeqpd: return getVectorFCmpIR(CmpInst::FCMP_OEQ); @@ -8581,64 +10211,79 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cmpps: case X86::BI__builtin_ia32_cmpps256: case X86::BI__builtin_ia32_cmppd: - case 
X86::BI__builtin_ia32_cmppd256: { - unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue(); - // If this one of the SSE immediates, we can use native IR. - if (CC < 8) { - FCmpInst::Predicate Pred; - switch (CC) { - case 0: Pred = FCmpInst::FCMP_OEQ; break; - case 1: Pred = FCmpInst::FCMP_OLT; break; - case 2: Pred = FCmpInst::FCMP_OLE; break; - case 3: Pred = FCmpInst::FCMP_UNO; break; - case 4: Pred = FCmpInst::FCMP_UNE; break; - case 5: Pred = FCmpInst::FCMP_UGE; break; - case 6: Pred = FCmpInst::FCMP_UGT; break; - case 7: Pred = FCmpInst::FCMP_ORD; break; - } - return getVectorFCmpIR(Pred); + case X86::BI__builtin_ia32_cmppd256: + case X86::BI__builtin_ia32_cmpps128_mask: + case X86::BI__builtin_ia32_cmpps256_mask: + case X86::BI__builtin_ia32_cmpps512_mask: + case X86::BI__builtin_ia32_cmppd128_mask: + case X86::BI__builtin_ia32_cmppd256_mask: + case X86::BI__builtin_ia32_cmppd512_mask: { + // Lower vector comparisons to fcmp instructions, while + // ignoring the requested signalling behaviour and + // ignoring the requested rounding mode. + // This is only possible as long as FENV_ACCESS is not implemented. + // See also: https://reviews.llvm.org/D45616 + + // The third argument is the comparison condition, an integer in the + // range [0, 31]. + unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f; + + // Lowering to IR fcmp instruction. + // Ignoring requested signaling behaviour, + // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT. + FCmpInst::Predicate Pred; + switch (CC) { + case 0x00: Pred = FCmpInst::FCMP_OEQ; break; + case 0x01: Pred = FCmpInst::FCMP_OLT; break; + case 0x02: Pred = FCmpInst::FCMP_OLE; break; + case 0x03: Pred = FCmpInst::FCMP_UNO; break; + case 0x04: Pred = FCmpInst::FCMP_UNE; break; + case 0x05: Pred = FCmpInst::FCMP_UGE; break; + case 0x06: Pred = FCmpInst::FCMP_UGT; break; + case 0x07: Pred = FCmpInst::FCMP_ORD; break; + case 0x08: Pred = FCmpInst::FCMP_UEQ; break; + case 0x09: Pred = FCmpInst::FCMP_ULT; break; + case 0x0a: Pred = FCmpInst::FCMP_ULE; break; + case 0x0b: Pred = FCmpInst::FCMP_FALSE; break; + case 0x0c: Pred = FCmpInst::FCMP_ONE; break; + case 0x0d: Pred = FCmpInst::FCMP_OGE; break; + case 0x0e: Pred = FCmpInst::FCMP_OGT; break; + case 0x0f: Pred = FCmpInst::FCMP_TRUE; break; + case 0x10: Pred = FCmpInst::FCMP_OEQ; break; + case 0x11: Pred = FCmpInst::FCMP_OLT; break; + case 0x12: Pred = FCmpInst::FCMP_OLE; break; + case 0x13: Pred = FCmpInst::FCMP_UNO; break; + case 0x14: Pred = FCmpInst::FCMP_UNE; break; + case 0x15: Pred = FCmpInst::FCMP_UGE; break; + case 0x16: Pred = FCmpInst::FCMP_UGT; break; + case 0x17: Pred = FCmpInst::FCMP_ORD; break; + case 0x18: Pred = FCmpInst::FCMP_UEQ; break; + case 0x19: Pred = FCmpInst::FCMP_ULT; break; + case 0x1a: Pred = FCmpInst::FCMP_ULE; break; + case 0x1b: Pred = FCmpInst::FCMP_FALSE; break; + case 0x1c: Pred = FCmpInst::FCMP_ONE; break; + case 0x1d: Pred = FCmpInst::FCMP_OGE; break; + case 0x1e: Pred = FCmpInst::FCMP_OGT; break; + case 0x1f: Pred = FCmpInst::FCMP_TRUE; break; + default: llvm_unreachable("Unhandled CC"); } - // We can't handle 8-31 immediates with native IR, use the intrinsic. - // Except for predicates that create constants.
- Intrinsic::ID ID; + // Builtins without the _mask suffix return a vector of integers + // of the same width as the input vectors switch (BuiltinID) { - default: llvm_unreachable("Unsupported intrinsic!"); - case X86::BI__builtin_ia32_cmpps: - ID = Intrinsic::x86_sse_cmp_ps; - break; - case X86::BI__builtin_ia32_cmpps256: - // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector - // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0... - if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) { - Value *Constant = (CC == 0xf || CC == 0x1f) ? - llvm::Constant::getAllOnesValue(Builder.getInt32Ty()) : - llvm::Constant::getNullValue(Builder.getInt32Ty()); - Value *Vec = Builder.CreateVectorSplat( - Ops[0]->getType()->getVectorNumElements(), Constant); - return Builder.CreateBitCast(Vec, Ops[0]->getType()); - } - ID = Intrinsic::x86_avx_cmp_ps_256; - break; - case X86::BI__builtin_ia32_cmppd: - ID = Intrinsic::x86_sse2_cmp_pd; - break; - case X86::BI__builtin_ia32_cmppd256: - // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector - // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0... - if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) { - Value *Constant = (CC == 0xf || CC == 0x1f) ? - llvm::Constant::getAllOnesValue(Builder.getInt64Ty()) : - llvm::Constant::getNullValue(Builder.getInt64Ty()); - Value *Vec = Builder.CreateVectorSplat( - Ops[0]->getType()->getVectorNumElements(), Constant); - return Builder.CreateBitCast(Vec, Ops[0]->getType()); - } - ID = Intrinsic::x86_avx_cmp_pd_256; - break; + case X86::BI__builtin_ia32_cmpps512_mask: + case X86::BI__builtin_ia32_cmppd512_mask: + case X86::BI__builtin_ia32_cmpps128_mask: + case X86::BI__builtin_ia32_cmpps256_mask: + case X86::BI__builtin_ia32_cmppd128_mask: + case X86::BI__builtin_ia32_cmppd256_mask: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]); + return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]); + } + default: + return getVectorFCmpIR(Pred); } - - return Builder.CreateCall(CGM.getIntrinsic(ID), Ops); } // SSE scalar comparison intrinsics @@ -8716,6 +10361,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, llvm::SyncScope::System); } + case X86::BI__shiftleft128: + case X86::BI__shiftright128: { + // FIXME: Once fshl/fshr no longer add an unneeded and and cmov, do this: + // llvm::Function *F = CGM.getIntrinsic( + // BuiltinID == X86::BI__shiftleft128 ? 
Intrinsic::fshl : Intrinsic::fshr, + // Int64Ty); + // Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty); + // return Builder.CreateCall(F, Ops); + llvm::Type *Int128Ty = Builder.getInt128Ty(); + Value *Val = Builder.CreateOr( + Builder.CreateShl(Builder.CreateZExt(Ops[1], Int128Ty), 64), + Builder.CreateZExt(Ops[0], Int128Ty)); + Value *Amt = Builder.CreateAnd(Builder.CreateZExt(Ops[2], Int128Ty), + llvm::ConstantInt::get(Int128Ty, 0x3f)); + Value *Res; + if (BuiltinID == X86::BI__shiftleft128) + Res = Builder.CreateLShr(Builder.CreateShl(Val, Amt), 64); + else + Res = Builder.CreateLShr(Val, Amt); + return Builder.CreateTrunc(Res, Int64Ty); + } case X86::BI_ReadWriteBarrier: case X86::BI_ReadBarrier: case X86::BI_WriteBarrier: { @@ -9199,19 +10865,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2)); Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2)); - // Element zero comes from the first input vector and element one comes from - // the second. The element indices within each vector are numbered in big - // endian order so the shuffle mask must be adjusted for this on little - // endian platforms (i.e. index is complemented and source vector reversed). - unsigned ElemIdx0; - unsigned ElemIdx1; - if (getTarget().isLittleEndian()) { - ElemIdx0 = (~Index & 1) + 2; - ElemIdx1 = (~Index & 2) >> 1; - } else { // BigEndian - ElemIdx0 = (Index & 2) >> 1; - ElemIdx1 = 2 + (Index & 1); - } + // Account for endianness by treating this as just a shuffle. So we use the + // same indices for both LE and BE in order to produce expected results in + // both cases. + unsigned ElemIdx0 = (Index & 2) >> 1; + unsigned ElemIdx1 = 2 + (Index & 1); Constant *ShuffleElts[2] = {ConstantInt::get(Int32Ty, ElemIdx0), ConstantInt::get(Int32Ty, ElemIdx1)}; @@ -9402,6 +11060,49 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, CI->setConvergent(); return CI; } + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: { + llvm::SmallVector<llvm::Value *, 5> Args; + for (unsigned I = 0; I != 5; ++I) + Args.push_back(EmitScalarExpr(E->getArg(I))); + const llvm::Type *PtrTy = Args[0]->getType(); + // check pointer parameter + if (!PtrTy->isPointerTy() || + E->getArg(0) + ->getType() + ->getPointeeType() + .getQualifiers() + .getAddressSpace() != LangAS::opencl_local || + !PtrTy->getPointerElementType()->isFloatTy()) { + CGM.Error(E->getArg(0)->getLocStart(), + "parameter should have type \"local float*\""); + return nullptr; + } + // check float parameter + if (!Args[1]->getType()->isFloatTy()) { + CGM.Error(E->getArg(1)->getLocStart(), + "parameter should have type \"float\""); + return nullptr; + } + + Intrinsic::ID ID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + ID = Intrinsic::amdgcn_ds_fadd; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + ID = Intrinsic::amdgcn_ds_fmin; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + ID = Intrinsic::amdgcn_ds_fmax; + break; + default: + llvm_unreachable("Unknown BuiltinID"); + } + Value *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Args); + } // amdgcn workitem case AMDGPU::BI__builtin_amdgcn_workitem_id_x: @@ -10032,7 +11733,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__hmma_m16n16k16_ld_a: case NVPTX::BI__hmma_m16n16k16_ld_b: case NVPTX::BI__hmma_m16n16k16_ld_c_f16: - case 
NVPTX::BI__hmma_m16n16k16_ld_c_f32: { + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + case NVPTX::BI__hmma_m32n8k16_ld_a: + case NVPTX::BI__hmma_m32n8k16_ld_b: + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + case NVPTX::BI__hmma_m8n32k16_ld_a: + case NVPTX::BI__hmma_m8n32k16_ld_b: + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Value *Src = EmitScalarExpr(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10044,31 +11753,70 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, unsigned NumResults; switch (BuiltinID) { case NVPTX::BI__hmma_m16n16k16_ld_a: - IID = isColMajor ? Intrinsic::nvvm_wmma_load_a_f16_col_stride - : Intrinsic::nvvm_wmma_load_a_f16_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride; NumResults = 8; break; case NVPTX::BI__hmma_m16n16k16_ld_b: - IID = isColMajor ? Intrinsic::nvvm_wmma_load_b_f16_col_stride - : Intrinsic::nvvm_wmma_load_b_f16_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride; NumResults = 8; break; case NVPTX::BI__hmma_m16n16k16_ld_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f16_col_stride - : Intrinsic::nvvm_wmma_load_c_f16_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride; NumResults = 4; break; case NVPTX::BI__hmma_m16n16k16_ld_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f32_col_stride - : Intrinsic::nvvm_wmma_load_c_f32_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride; + NumResults = 8; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_ld_c_f32: + IID = isColMajor ? 
Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride; NumResults = 8; break; default: llvm_unreachable("Unexpected builtin ID."); } Value *Result = - Builder.CreateCall(CGM.getIntrinsic(IID), - {Builder.CreatePointerCast(Src, VoidPtrTy), Ldm}); + Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm}); // Save returned values. for (unsigned i = 0; i < NumResults; ++i) { @@ -10082,7 +11830,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, } case NVPTX::BI__hmma_m16n16k16_st_c_f16: - case NVPTX::BI__hmma_m16n16k16_st_c_f32: { + case NVPTX::BI__hmma_m16n16k16_st_c_f32: + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + case NVPTX::BI__hmma_m8n32k16_st_c_f32: { Value *Dst = EmitScalarExpr(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *Ldm = EmitScalarExpr(E->getArg(2)); @@ -10096,21 +11848,38 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, // for some reason nvcc builtins use _c_. switch (BuiltinID) { case NVPTX::BI__hmma_m16n16k16_st_c_f16: - IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f16_col_stride - : Intrinsic::nvvm_wmma_store_d_f16_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride; NumResults = 4; break; case NVPTX::BI__hmma_m16n16k16_st_c_f32: - IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f32_col_stride - : Intrinsic::nvvm_wmma_store_d_f32_row_stride; + IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride; + break; + case NVPTX::BI__hmma_m32n8k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m32n8k16_st_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride; + NumResults = 4; + break; + case NVPTX::BI__hmma_m8n32k16_st_c_f32: + IID = isColMajor ? 
Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride + : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride; break; default: llvm_unreachable("Unexpected builtin ID."); } - Function *Intrinsic = CGM.getIntrinsic(IID); + Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType()); llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1); - SmallVector<Value *, 10> Values; - Values.push_back(Builder.CreatePointerCast(Dst, VoidPtrTy)); + SmallVector<Value *, 10> Values = {Dst}; for (unsigned i = 0; i < NumResults; ++i) { Value *V = Builder.CreateAlignedLoad( Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)), @@ -10122,12 +11891,20 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return Result; } - // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) - // --> Intrinsic::nvvm_wmma_mma_sync<layout A,B><DType><CType><Satf> + // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) --> + // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf> case NVPTX::BI__hmma_m16n16k16_mma_f16f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f16: case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - case NVPTX::BI__hmma_m16n16k16_mma_f16f32: { + case NVPTX::BI__hmma_m16n16k16_mma_f16f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: { Address Dst = EmitPointerWithAlignment(E->getArg(0)); Address SrcA = EmitPointerWithAlignment(E->getArg(1)); Address SrcB = EmitPointerWithAlignment(E->getArg(2)); @@ -10144,15 +11921,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, bool Satf = SatfArg.getSExtValue(); // clang-format off -#define MMA_VARIANTS(type) {{ \ - Intrinsic::nvvm_wmma_mma_sync_row_row_##type, \ - Intrinsic::nvvm_wmma_mma_sync_row_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_mma_sync_row_col_##type, \ - Intrinsic::nvvm_wmma_mma_sync_row_col_##type##_satfinite, \ - Intrinsic::nvvm_wmma_mma_sync_col_row_##type, \ - Intrinsic::nvvm_wmma_mma_sync_col_row_##type##_satfinite, \ - Intrinsic::nvvm_wmma_mma_sync_col_col_##type, \ - Intrinsic::nvvm_wmma_mma_sync_col_col_##type##_satfinite \ +#define MMA_VARIANTS(geom, type) {{ \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \ + Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \ }} // clang-format on @@ -10166,22 +11943,62 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, unsigned NumEltsD; switch (BuiltinID) { case NVPTX::BI__hmma_m16n16k16_mma_f16f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16)); NumEltsC = 4; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f16: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16)); NumEltsC = 4; NumEltsD = 8; break; case NVPTX::BI__hmma_m16n16k16_mma_f16f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32)); + IID = 
getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32)); NumEltsC = 8; NumEltsD = 4; break; case NVPTX::BI__hmma_m16n16k16_mma_f32f32: - IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32)); + IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m32n8k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32)); + NumEltsC = 8; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16)); + NumEltsC = 4; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f16: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16)); + NumEltsC = 4; + NumEltsD = 8; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f16f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32)); + NumEltsC = 8; + NumEltsD = 4; + break; + case NVPTX::BI__hmma_m8n32k16_mma_f32f32: + IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32)); NumEltsC = 8; NumEltsD = 8; break; @@ -10235,6 +12052,36 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { switch (BuiltinID) { + case WebAssembly::BI__builtin_wasm_memory_size: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *I = EmitScalarExpr(E->getArg(0)); + Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType); + return Builder.CreateCall(Callee, I); + } + case WebAssembly::BI__builtin_wasm_memory_grow: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *Args[] = { + EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)) + }; + Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType); + return Builder.CreateCall(Callee, Args); + } + case WebAssembly::BI__builtin_wasm_mem_size: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *I = EmitScalarExpr(E->getArg(0)); + Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_size, ResultType); + return Builder.CreateCall(Callee, I); + } + case WebAssembly::BI__builtin_wasm_mem_grow: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *Args[] = { + EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)) + }; + Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_grow, ResultType); + return Builder.CreateCall(Callee, Args); + } case WebAssembly::BI__builtin_wasm_current_memory: { llvm::Type *ResultType = ConvertType(E->getType()); Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType); @@ -10266,6 +12113,93 @@ Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID, SmallVector<llvm::Value *, 4> Ops; Intrinsic::ID ID = Intrinsic::not_intrinsic; + auto MakeCircLd = [&](unsigned IntID, bool HasImm) { + // The base pointer is passed by address, so it needs to be loaded. + Address BP = EmitPointerWithAlignment(E->getArg(0)); + BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy), + BP.getAlignment()); + llvm::Value *Base = Builder.CreateLoad(BP); + // Operands are Base, Increment, Modifier, Start. 
+ if (HasImm) + Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3)) }; + else + Ops = { Base, EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)) }; + + llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops); + llvm::Value *NewBase = Builder.CreateExtractValue(Result, 1); + llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), + NewBase->getType()->getPointerTo()); + Address Dest = EmitPointerWithAlignment(E->getArg(0)); + // The intrinsic generates two results. The new value for the base pointer + // needs to be stored. + Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment()); + return Builder.CreateExtractValue(Result, 0); + }; + + auto MakeCircSt = [&](unsigned IntID, bool HasImm) { + // The base pointer is passed by address, so it needs to be loaded. + Address BP = EmitPointerWithAlignment(E->getArg(0)); + BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy), + BP.getAlignment()); + llvm::Value *Base = Builder.CreateLoad(BP); + // Operands are Base, Increment, Modifier, Value, Start. + if (HasImm) + Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3)), EmitScalarExpr(E->getArg(4)) }; + else + Ops = { Base, EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)) }; + + llvm::Value *NewBase = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops); + llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), + NewBase->getType()->getPointerTo()); + Address Dest = EmitPointerWithAlignment(E->getArg(0)); + // The intrinsic generates one result, which is the new value for the base + // pointer. It needs to be stored. + return Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment()); + }; + + // Handle the conversion of bit-reverse load intrinsics to bit code. + // The intrinsic call after this function only reads from memory and the + // write to memory is handled by the store instruction. + auto MakeBrevLd = [&](unsigned IntID, llvm::Type *DestTy) { + // The intrinsic generates one result, which is the new value for the base + // pointer. It needs to be returned. The result of the load instruction is + // passed to the intrinsic by address, so the value needs to be stored. + llvm::Value *BaseAddress = + Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy); + + // Expressions like &(*pt++) will be incremented per evaluation. + // EmitPointerWithAlignment and EmitScalarExpr evaluate the expression + // per call. + Address DestAddr = EmitPointerWithAlignment(E->getArg(1)); + DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy), + DestAddr.getAlignment()); + llvm::Value *DestAddress = DestAddr.getPointer(); + + // Operands are Base, Dest, Modifier. + // The intrinsic format in LLVM IR is defined as + // { ValueType, i8* } (i8*, i32). + Ops = {BaseAddress, EmitScalarExpr(E->getArg(2))}; + + llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops); + // The value needs to be stored as the variable is passed by reference. + llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0); + + // The stored value needs to be truncated to fit the destination type. + // While i32 and i64 are natively supported on Hexagon, i8 and i16 need + // to be handled with stores of the respective destination type.
+ DestVal = Builder.CreateTrunc(DestVal, DestTy); + + llvm::Value *DestForStore = + Builder.CreateBitCast(DestAddress, DestVal->getType()->getPointerTo()); + Builder.CreateAlignedStore(DestVal, DestForStore, DestAddr.getAlignment()); + // The updated value of the base pointer is returned. + return Builder.CreateExtractValue(Result, 1); + }; + switch (BuiltinID) { case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry: case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B: { @@ -10311,6 +12245,64 @@ Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID, Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment()); return Builder.CreateExtractValue(Result, 0); } + case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadri_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci: + return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadri_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr: + return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci: + return MakeCircSt(Intrinsic::hexagon_S2_storerb_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci: + return MakeCircSt(Intrinsic::hexagon_S2_storerh_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci: + return MakeCircSt(Intrinsic::hexagon_S2_storerf_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci: + return MakeCircSt(Intrinsic::hexagon_S2_storeri_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci: + return MakeCircSt(Intrinsic::hexagon_S2_storerd_pci, /*HasImm*/true); + case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr: + return MakeCircSt(Intrinsic::hexagon_S2_storerb_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr: + return MakeCircSt(Intrinsic::hexagon_S2_storerh_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr: + return MakeCircSt(Intrinsic::hexagon_S2_storerf_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr: + return MakeCircSt(Intrinsic::hexagon_S2_storeri_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr: + return MakeCircSt(Intrinsic::hexagon_S2_storerd_pcr, /*HasImm*/false); + case Hexagon::BI__builtin_brev_ldub: + return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty); + case Hexagon::BI__builtin_brev_ldb: + return 
MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty); + case Hexagon::BI__builtin_brev_lduh: + return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty); + case Hexagon::BI__builtin_brev_ldh: + return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty); + case Hexagon::BI__builtin_brev_ldw: + return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty); + case Hexagon::BI__builtin_brev_ldd: + return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty); + default: + break; } // switch return nullptr; |
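A note on the byte-shift lowering earlier in this diff: the pslldqi/psrldqi builtins are no longer emitted as target intrinsics but as a shufflevector of the source against an all-zero vector, with indices computed per 128-bit lane. The standalone sketch below is not part of the patch; it only mirrors the pslldq index computation (the function name and driver are illustrative) so the mapping can be checked outside of clang.

  #include <cstdint>
  #include <cstdio>

  // Mirrors the pslldq index math above: shuffle indices below NumElts pick
  // bytes from the zero operand, indices >= NumElts pick bytes from the
  // source operand, and a shift never crosses a 16-byte lane boundary.
  static void pslldqIndices(unsigned NumElts, unsigned ShiftVal,
                            uint32_t *Indices) {
    for (unsigned l = 0; l != NumElts; l += 16) {
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = NumElts + i - ShiftVal; // source byte (i - ShiftVal)
        if (Idx < NumElts)                     // ran off the start of the lane:
          Idx -= NumElts - 16;                 // take a byte of the zero operand
        Indices[l + i] = Idx + l;
      }
    }
  }

  int main() {
    uint32_t Indices[64];
    pslldqIndices(/*NumElts=*/32, /*ShiftVal=*/3, Indices); // 256-bit vector
    for (unsigned i = 0; i != 32; ++i)
      std::printf("%u ", Indices[i]);
    std::printf("\n");
  }

For shift amounts of 16 or more the emitted code instead returns a zero vector directly, as the case in the diff shows.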

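The __shiftleft128/__shiftright128 lowering in this diff likewise avoids a target intrinsic: it concatenates the two 64-bit halves into an i128 value, masks the shift amount to six bits, shifts, and truncates back to i64. Below is a minimal reference model of that sequence, assuming a compiler that provides the unsigned __int128 extension; the helper names are illustrative, not Clang or MSVC APIs.

  #include <cstdint>
  #include <cstdio>

  // High:Low is widened to 128 bits, the amount is masked to 6 bits, and the
  // relevant 64-bit half of the shifted value is returned, matching the IR
  // sequence emitted for the builtins.
  static uint64_t shl128(uint64_t Low, uint64_t High, unsigned char Amt) {
    unsigned __int128 Val = ((unsigned __int128)High << 64) | Low;
    Amt &= 0x3f;
    return (uint64_t)((Val << Amt) >> 64); // high half of the shifted value
  }

  static uint64_t shr128(uint64_t Low, uint64_t High, unsigned char Amt) {
    unsigned __int128 Val = ((unsigned __int128)High << 64) | Low;
    Amt &= 0x3f;
    return (uint64_t)(Val >> Amt);         // low half of the shifted value
  }

  int main() {
    // 0x1:0x8000000000000000 shifted left by 1 has a high half of 0x3.
    std::printf("%llx\n", (unsigned long long)shl128(0x8000000000000000ULL, 0x1ULL, 1));
    // 0x1:0x0000000000000002 shifted right by 1 has a low half of 0x8000000000000001.
    std::printf("%llx\n", (unsigned long long)shr128(0x2ULL, 0x1ULL, 1));
  }

The FIXME in the patch notes that the same semantics could eventually be expressed with the fshl/fshr funnel-shift intrinsics once their codegen no longer introduces the extra and/cmov.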