Diffstat (limited to 'gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp')
-rw-r--r--  gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp  3078
1 file changed, 2535 insertions(+), 543 deletions(-)
diff --git a/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp b/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
index 35ae114c4f2..0770c204dab 100644
--- a/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/gnu/llvm/tools/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14,6 +14,7 @@
#include "CGCXXABI.h"
#include "CGObjCRuntime.h"
#include "CGOpenCLRuntime.h"
+#include "CGRecordLayout.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "ConstantEmitter.h"
@@ -188,7 +189,7 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
return RValue::get(Result);
}
-/// @brief Utility to insert an atomic cmpxchg instruction.
+/// Utility to insert an atomic cmpxchg instruction.
///
/// @param CGF The current codegen function.
/// @param E Builtin call expression to convert to cmpxchg.
@@ -319,7 +320,7 @@ static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
}
-/// \brief Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
+/// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
/// depending on IntrinsicID.
///
/// \arg CGF The current codegen function.
@@ -384,7 +385,7 @@ EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
}
// The encompassing type must have a width greater than or equal to the width
- // of the specified types. Aditionally, if the encompassing type is signed,
+ // of the specified types. Additionally, if the encompassing type is signed,
// its width must be strictly greater than the width of any unsigned types
// given.
unsigned Width = 0;
@@ -478,13 +479,261 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
// LLVM only supports 0 and 2, make sure that we pass along that as a boolean.
Value *Min = Builder.getInt1((Type & 2) != 0);
- // For GCC compatability, __builtin_object_size treat NULL as unknown size.
+ // For GCC compatibility, __builtin_object_size treats NULL as unknown size.
Value *NullIsUnknown = Builder.getTrue();
return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown});
}
-// Many of MSVC builtins are on both x64 and ARM; to avoid repeating code, we
-// handle them here.
+namespace {
+/// A struct to generically describe a bit test intrinsic.
+struct BitTest {
+ enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
+ enum InterlockingKind : uint8_t {
+ Unlocked,
+ Sequential,
+ Acquire,
+ Release,
+ NoFence
+ };
+
+ ActionKind Action;
+ InterlockingKind Interlocking;
+ bool Is64Bit;
+
+ static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
+};
+} // namespace
+
+BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
+ switch (BuiltinID) {
+ // Main portable variants.
+ case Builtin::BI_bittest:
+ return {TestOnly, Unlocked, false};
+ case Builtin::BI_bittestandcomplement:
+ return {Complement, Unlocked, false};
+ case Builtin::BI_bittestandreset:
+ return {Reset, Unlocked, false};
+ case Builtin::BI_bittestandset:
+ return {Set, Unlocked, false};
+ case Builtin::BI_interlockedbittestandreset:
+ return {Reset, Sequential, false};
+ case Builtin::BI_interlockedbittestandset:
+ return {Set, Sequential, false};
+
+ // X86-specific 64-bit variants.
+ case Builtin::BI_bittest64:
+ return {TestOnly, Unlocked, true};
+ case Builtin::BI_bittestandcomplement64:
+ return {Complement, Unlocked, true};
+ case Builtin::BI_bittestandreset64:
+ return {Reset, Unlocked, true};
+ case Builtin::BI_bittestandset64:
+ return {Set, Unlocked, true};
+ case Builtin::BI_interlockedbittestandreset64:
+ return {Reset, Sequential, true};
+ case Builtin::BI_interlockedbittestandset64:
+ return {Set, Sequential, true};
+
+ // ARM/AArch64-specific ordering variants.
+ case Builtin::BI_interlockedbittestandset_acq:
+ return {Set, Acquire, false};
+ case Builtin::BI_interlockedbittestandset_rel:
+ return {Set, Release, false};
+ case Builtin::BI_interlockedbittestandset_nf:
+ return {Set, NoFence, false};
+ case Builtin::BI_interlockedbittestandreset_acq:
+ return {Reset, Acquire, false};
+ case Builtin::BI_interlockedbittestandreset_rel:
+ return {Reset, Release, false};
+ case Builtin::BI_interlockedbittestandreset_nf:
+ return {Reset, NoFence, false};
+ }
+ llvm_unreachable("expected only bittest intrinsics");
+}
+
+static char bitActionToX86BTCode(BitTest::ActionKind A) {
+ switch (A) {
+ case BitTest::TestOnly: return '\0';
+ case BitTest::Complement: return 'c';
+ case BitTest::Reset: return 'r';
+ case BitTest::Set: return 's';
+ }
+ llvm_unreachable("invalid action");
+}
+
+static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
+ BitTest BT,
+ const CallExpr *E, Value *BitBase,
+ Value *BitPos) {
+ char Action = bitActionToX86BTCode(BT.Action);
+ char SizeSuffix = BT.Is64Bit ? 'q' : 'l';
+
+ // Build the assembly.
+ SmallString<64> Asm;
+ raw_svector_ostream AsmOS(Asm);
+ if (BT.Interlocking != BitTest::Unlocked)
+ AsmOS << "lock ";
+ AsmOS << "bt";
+ if (Action)
+ AsmOS << Action;
+ AsmOS << SizeSuffix << " $2, ($1)\n\tsetc ${0:b}";
+
+ // Build the constraints. FIXME: We should support immediates when possible.
+ std::string Constraints = "=r,r,r,~{cc},~{flags},~{fpsr}";
+ llvm::IntegerType *IntType = llvm::IntegerType::get(
+ CGF.getLLVMContext(),
+ CGF.getContext().getTypeSize(E->getArg(1)->getType()));
+ llvm::Type *IntPtrType = IntType->getPointerTo();
+ llvm::FunctionType *FTy =
+ llvm::FunctionType::get(CGF.Int8Ty, {IntPtrType, IntType}, false);
+
+ llvm::InlineAsm *IA =
+ llvm::InlineAsm::get(FTy, Asm, Constraints, /*SideEffects=*/true);
+ return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
+}
+
+static llvm::AtomicOrdering
+getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
+ switch (I) {
+ case BitTest::Unlocked: return llvm::AtomicOrdering::NotAtomic;
+ case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
+ case BitTest::Acquire: return llvm::AtomicOrdering::Acquire;
+ case BitTest::Release: return llvm::AtomicOrdering::Release;
+ case BitTest::NoFence: return llvm::AtomicOrdering::Monotonic;
+ }
+ llvm_unreachable("invalid interlocking");
+}
+
+/// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
+/// bits and a bit position and read and optionally modify the bit at that
+/// position. The position index can be arbitrarily large, i.e. it can be larger
+/// than 31 or 63, so we need an indexed load in the general case.
+static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
+ unsigned BuiltinID,
+ const CallExpr *E) {
+ Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
+ Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));
+
+ BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);
+
+ // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
+ // indexing operation internally. Use them if possible.
+ llvm::Triple::ArchType Arch = CGF.getTarget().getTriple().getArch();
+ if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64)
+ return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);
+
+ // Otherwise, use generic code to load one byte and test the bit. Use all but
+ // the bottom three bits as the array index, and the bottom three bits to form
+ // a mask.
+ // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
+ Value *ByteIndex = CGF.Builder.CreateAShr(
+ BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
+ Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
+ Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
+ ByteIndex, "bittest.byteaddr"),
+ CharUnits::One());
+ Value *PosLow =
+ CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
+ llvm::ConstantInt::get(CGF.Int8Ty, 0x7));
+
+ // The updating instructions will need a mask.
+ Value *Mask = nullptr;
+ if (BT.Action != BitTest::TestOnly) {
+ Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
+ "bittest.mask");
+ }
+
+ // Check the action and ordering of the interlocked intrinsics.
+ llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);
+
+ Value *OldByte = nullptr;
+ if (Ordering != llvm::AtomicOrdering::NotAtomic) {
+ // Emit a combined atomicrmw load/store operation for the interlocked
+ // intrinsics.
+ llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
+ if (BT.Action == BitTest::Reset) {
+ Mask = CGF.Builder.CreateNot(Mask);
+ RMWOp = llvm::AtomicRMWInst::And;
+ }
+ OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr.getPointer(), Mask,
+ Ordering);
+ } else {
+ // Emit a plain load for the non-interlocked intrinsics.
+ OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
+ Value *NewByte = nullptr;
+ switch (BT.Action) {
+ case BitTest::TestOnly:
+ // Don't store anything.
+ break;
+ case BitTest::Complement:
+ NewByte = CGF.Builder.CreateXor(OldByte, Mask);
+ break;
+ case BitTest::Reset:
+ NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
+ break;
+ case BitTest::Set:
+ NewByte = CGF.Builder.CreateOr(OldByte, Mask);
+ break;
+ }
+ if (NewByte)
+ CGF.Builder.CreateStore(NewByte, ByteAddr);
+ }
+
+ // However we loaded the old byte, either by plain load or atomicrmw, shift
+ // the bit into the low position and mask it to 0 or 1.
+ Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
+ return CGF.Builder.CreateAnd(
+ ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
+}
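
On non-x86 targets the sequence above amounts to the following byte-wise computation (an illustrative C++ rendering, not part of the patch; the bit base is addressed as bytes, and the interlocked variants perform the update with an atomicrmw or/and instead of the plain load and store):

    static unsigned char genericBitTestAndSet(unsigned char *Base, long Pos) {
      unsigned char *Byte = Base + (Pos >> 3);   // indexed load; Pos may exceed 31/63
      unsigned char Mask = 1 << (Pos & 0x7);
      unsigned char Old = *Byte;
      *Byte = Old | Mask;                        // Complement uses ^, Reset uses & ~Mask
      return (Old >> (Pos & 0x7)) & 1;           // previous value of the addressed bit
    }
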
+
+namespace {
+enum class MSVCSetJmpKind {
+ _setjmpex,
+ _setjmp3,
+ _setjmp
+};
+}
+
+/// MSVC handles setjmp a bit differently on different platforms. On every
+/// architecture except 32-bit x86, the frame address is passed. On x86, extra
+/// parameters can be passed as variadic arguments, but we always pass none.
+static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
+ const CallExpr *E) {
+ llvm::Value *Arg1 = nullptr;
+ llvm::Type *Arg1Ty = nullptr;
+ StringRef Name;
+ bool IsVarArg = false;
+ if (SJKind == MSVCSetJmpKind::_setjmp3) {
+ Name = "_setjmp3";
+ Arg1Ty = CGF.Int32Ty;
+ Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
+ IsVarArg = true;
+ } else {
+ Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
+ Arg1Ty = CGF.Int8PtrTy;
+ Arg1 = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(Intrinsic::frameaddress),
+ llvm::ConstantInt::get(CGF.Int32Ty, 0));
+ }
+
+ // Mark the call site and declaration with ReturnsTwice.
+ llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
+ llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
+ CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
+ llvm::Attribute::ReturnsTwice);
+ llvm::Constant *SetJmpFn = CGF.CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
+ ReturnsTwiceAttr, /*Local=*/true);
+
+ llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
+ CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
+ llvm::Value *Args[] = {Buf, Arg1};
+ llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
+ CS.setAttributes(ReturnsTwiceAttr);
+ return RValue::get(CS.getInstruction());
+}
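
Roughly, the runtime calls this helper emits for _setjmp(buf) under the MSVC runtime are as follows (an approximation for illustration, not a literal transcription of the emitted IR):

    // x86:      _setjmp3((char *)buf, 0);                           // variadic; no extras passed
    // x64/ARM:  _setjmp((char *)buf, __builtin_frame_address(0));
    // AArch64:  _setjmpex((char *)buf, __builtin_frame_address(0)); // see the BI_setjmp case below
    // Both the runtime declaration and the call site are marked returns_twice.
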
+
+// Many MSVC builtins are available on x64, ARM, and AArch64; to avoid
+// repeating code, we handle them here.
enum class CodeGenFunction::MSVCIntrin {
_BitScanForward,
_BitScanReverse,
@@ -496,7 +745,6 @@ enum class CodeGenFunction::MSVCIntrin {
_InterlockedIncrement,
_InterlockedOr,
_InterlockedXor,
- _interlockedbittestandset,
__fastfail,
};
@@ -564,22 +812,6 @@ Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
case MSVCIntrin::_InterlockedXor:
return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
- case MSVCIntrin::_interlockedbittestandset: {
- llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Value *Bit = EmitScalarExpr(E->getArg(1));
- AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
- AtomicRMWInst::Or, Addr,
- Builder.CreateShl(ConstantInt::get(Bit->getType(), 1), Bit),
- llvm::AtomicOrdering::SequentiallyConsistent);
- // Shift the relevant bit to the least significant position, truncate to
- // the result type, and test the low bit.
- llvm::Value *Shifted = Builder.CreateLShr(RMWI, Bit);
- llvm::Value *Truncated =
- Builder.CreateTrunc(Shifted, ConvertType(E->getType()));
- return Builder.CreateAnd(Truncated,
- ConstantInt::get(Truncated->getType(), 1));
- }
-
case MSVCIntrin::_InterlockedDecrement: {
llvm::Type *IntTy = ConvertType(E->getType());
AtomicRMWInst *RMWI = Builder.CreateAtomicRMW(
@@ -930,6 +1162,96 @@ EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
return RValue::get(Overflow);
}
+static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType,
+ Value *&RecordPtr, CharUnits Align, Value *Func,
+ int Lvl) {
+ const auto *RT = RType->getAs<RecordType>();
+ ASTContext &Context = CGF.getContext();
+ RecordDecl *RD = RT->getDecl()->getDefinition();
+ ASTContext &Ctx = RD->getASTContext();
+ const ASTRecordLayout &RL = Ctx.getASTRecordLayout(RD);
+ std::string Pad = std::string(Lvl * 4, ' ');
+
+ Value *GString =
+ CGF.Builder.CreateGlobalStringPtr(RType.getAsString() + " {\n");
+ Value *Res = CGF.Builder.CreateCall(Func, {GString});
+
+ static llvm::DenseMap<QualType, const char *> Types;
+ if (Types.empty()) {
+ Types[Context.CharTy] = "%c";
+ Types[Context.BoolTy] = "%d";
+ Types[Context.SignedCharTy] = "%hhd";
+ Types[Context.UnsignedCharTy] = "%hhu";
+ Types[Context.IntTy] = "%d";
+ Types[Context.UnsignedIntTy] = "%u";
+ Types[Context.LongTy] = "%ld";
+ Types[Context.UnsignedLongTy] = "%lu";
+ Types[Context.LongLongTy] = "%lld";
+ Types[Context.UnsignedLongLongTy] = "%llu";
+ Types[Context.ShortTy] = "%hd";
+ Types[Context.UnsignedShortTy] = "%hu";
+ Types[Context.VoidPtrTy] = "%p";
+ Types[Context.FloatTy] = "%f";
+ Types[Context.DoubleTy] = "%f";
+ Types[Context.LongDoubleTy] = "%Lf";
+ Types[Context.getPointerType(Context.CharTy)] = "%s";
+ Types[Context.getPointerType(Context.getConstType(Context.CharTy))] = "%s";
+ }
+
+ for (const auto *FD : RD->fields()) {
+ uint64_t Off = RL.getFieldOffset(FD->getFieldIndex());
+ Off = Ctx.toCharUnitsFromBits(Off).getQuantity();
+
+ Value *FieldPtr = RecordPtr;
+ if (RD->isUnion())
+ FieldPtr = CGF.Builder.CreatePointerCast(
+ FieldPtr, CGF.ConvertType(Context.getPointerType(FD->getType())));
+ else
+ FieldPtr = CGF.Builder.CreateStructGEP(CGF.ConvertType(RType), FieldPtr,
+ FD->getFieldIndex());
+
+ GString = CGF.Builder.CreateGlobalStringPtr(
+ llvm::Twine(Pad)
+ .concat(FD->getType().getAsString())
+ .concat(llvm::Twine(' '))
+ .concat(FD->getNameAsString())
+ .concat(" : ")
+ .str());
+ Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
+ Res = CGF.Builder.CreateAdd(Res, TmpRes);
+
+ QualType CanonicalType =
+ FD->getType().getUnqualifiedType().getCanonicalType();
+
+ // If the field is itself a record, recurse into it
+ if (CanonicalType->isRecordType()) {
+ Value *TmpRes =
+ dumpRecord(CGF, CanonicalType, FieldPtr, Align, Func, Lvl + 1);
+ Res = CGF.Builder.CreateAdd(TmpRes, Res);
+ continue;
+ }
+
+ // We try to determine the best format to print the current field
+ llvm::Twine Format = Types.find(CanonicalType) == Types.end()
+ ? Types[Context.VoidPtrTy]
+ : Types[CanonicalType];
+
+ Address FieldAddress = Address(FieldPtr, Align);
+ FieldPtr = CGF.Builder.CreateLoad(FieldAddress);
+
+ // FIXME Need to handle bitfield here
+ GString = CGF.Builder.CreateGlobalStringPtr(
+ Format.concat(llvm::Twine('\n')).str());
+ TmpRes = CGF.Builder.CreateCall(Func, {GString, FieldPtr});
+ Res = CGF.Builder.CreateAdd(Res, TmpRes);
+ }
+
+ GString = CGF.Builder.CreateGlobalStringPtr(Pad + "}\n");
+ Value *TmpRes = CGF.Builder.CreateCall(Func, {GString});
+ Res = CGF.Builder.CreateAdd(Res, TmpRes);
+ return Res;
+}
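
A hedged usage sketch of the builtin this helper implements (assuming a clang that provides __builtin_dump_struct): dumpRecord walks the record layout, emits one call to the supplied function per field with a format chosen from the type table above, and recurses into nested record types.

    #include <cstdio>

    struct Pair { int first; float second; };

    void dumpPair(Pair *P) {
      // Prints the record type followed by one "type name : value" line per field.
      __builtin_dump_struct(P, &printf);
    }
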
+
RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
unsigned BuiltinID, const CallExpr *E,
ReturnValueSlot ReturnValue) {
@@ -966,6 +1288,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__builtin_copysign:
case Builtin::BI__builtin_copysignf:
case Builtin::BI__builtin_copysignl:
+ case Builtin::BI__builtin_copysignf128:
return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
case Builtin::BIcos:
@@ -998,6 +1321,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__builtin_fabs:
case Builtin::BI__builtin_fabsf:
case Builtin::BI__builtin_fabsl:
+ case Builtin::BI__builtin_fabsf128:
return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
case Builtin::BIfloor:
@@ -1158,16 +1482,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__builtin_abs:
case Builtin::BI__builtin_labs:
case Builtin::BI__builtin_llabs: {
+ // X < 0 ? -X : X
+ // The negation has 'nsw' because abs of INT_MIN is undefined.
Value *ArgValue = EmitScalarExpr(E->getArg(0));
-
- Value *NegOp = Builder.CreateNeg(ArgValue, "neg");
- Value *CmpResult =
- Builder.CreateICmpSGE(ArgValue,
- llvm::Constant::getNullValue(ArgValue->getType()),
- "abscond");
- Value *Result =
- Builder.CreateSelect(CmpResult, ArgValue, NegOp, "abs");
-
+ Value *NegOp = Builder.CreateNSWNeg(ArgValue, "neg");
+ Constant *Zero = llvm::Constant::getNullValue(ArgValue->getType());
+ Value *CmpResult = Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
+ Value *Result = Builder.CreateSelect(CmpResult, NegOp, ArgValue, "abs");
return RValue::get(Result);
}
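
An equivalent source-level sketch of the new lowering (illustrative only):

    int abs_equiv(int X) {
      // The negation is emitted with nsw, so abs(INT_MIN) stays undefined,
      // matching the precondition on abs/labs/llabs.
      return X < 0 ? -X : X;
    }
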
case Builtin::BI__builtin_conj:
@@ -1194,6 +1515,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
return RValue::get(ComplexVal.first);
}
+ case Builtin::BI__builtin_dump_struct: {
+ Value *Func = EmitScalarExpr(E->getArg(1)->IgnoreImpCasts());
+ CharUnits Arg0Align = EmitPointerWithAlignment(E->getArg(0)).getAlignment();
+
+ const Expr *Arg0 = E->getArg(0)->IgnoreImpCasts();
+ QualType Arg0Type = Arg0->getType()->getPointeeType();
+
+ Value *RecordPtr = EmitScalarExpr(Arg0);
+ Value *Res = dumpRecord(*this, Arg0Type, RecordPtr, Arg0Align, Func, 0);
+ return RValue::get(Res);
+ }
+
case Builtin::BI__builtin_cimag:
case Builtin::BI__builtin_cimagf:
case Builtin::BI__builtin_cimagl:
@@ -1304,20 +1637,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
llvm::Type *ArgType = Val->getType();
Shift = Builder.CreateIntCast(Shift, ArgType, false);
- unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
- Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth);
- Value *ArgZero = llvm::Constant::getNullValue(ArgType);
-
+ unsigned ArgWidth = ArgType->getIntegerBitWidth();
Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
- Shift = Builder.CreateAnd(Shift, Mask);
- Value *LeftShift = Builder.CreateSub(ArgTypeSize, Shift);
-
- Value *RightShifted = Builder.CreateLShr(Val, Shift);
- Value *LeftShifted = Builder.CreateShl(Val, LeftShift);
- Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted);
- Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero);
- Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated);
+ Value *RightShiftAmt = Builder.CreateAnd(Shift, Mask);
+ Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt);
+ Value *LeftShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask);
+ Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt);
+ Value *Result = Builder.CreateOr(LeftShifted, RightShifted);
return RValue::get(Result);
}
case Builtin::BI_rotl8:
@@ -1330,20 +1657,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
llvm::Type *ArgType = Val->getType();
Shift = Builder.CreateIntCast(Shift, ArgType, false);
- unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
- Value *ArgTypeSize = llvm::ConstantInt::get(ArgType, ArgWidth);
- Value *ArgZero = llvm::Constant::getNullValue(ArgType);
-
+ unsigned ArgWidth = ArgType->getIntegerBitWidth();
Value *Mask = llvm::ConstantInt::get(ArgType, ArgWidth - 1);
- Shift = Builder.CreateAnd(Shift, Mask);
- Value *RightShift = Builder.CreateSub(ArgTypeSize, Shift);
-
- Value *LeftShifted = Builder.CreateShl(Val, Shift);
- Value *RightShifted = Builder.CreateLShr(Val, RightShift);
- Value *Rotated = Builder.CreateOr(LeftShifted, RightShifted);
- Value *ShiftIsZero = Builder.CreateICmpEQ(Shift, ArgZero);
- Value *Result = Builder.CreateSelect(ShiftIsZero, Val, Rotated);
+ Value *LeftShiftAmt = Builder.CreateAnd(Shift, Mask);
+ Value *LeftShifted = Builder.CreateShl(Val, LeftShiftAmt);
+ Value *RightShiftAmt = Builder.CreateAnd(Builder.CreateNeg(Shift), Mask);
+ Value *RightShifted = Builder.CreateLShr(Val, RightShiftAmt);
+ Value *Result = Builder.CreateOr(LeftShifted, RightShifted);
return RValue::get(Result);
}
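
Both rotate cases now use the mask-and-OR idiom, which keeps every shift amount strictly below the bit width and so no longer needs a select on "shift == 0". A C++ sketch of the same pattern for a 32-bit left rotate (illustrative; the right rotate mirrors it):

    unsigned rotl32(unsigned V, unsigned S) {
      return (V << (S & 31)) | (V >> (-S & 31));   // -S & 31 == (32 - S) % 32; no UB when S == 0
    }
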
case Builtin::BI__builtin_unpredictable: {
@@ -1739,6 +2060,63 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
return RValue::get(Dest.getPointer());
}
+ case Builtin::BI__builtin_wmemcmp: {
+ // The MSVC runtime library does not provide a definition of wmemcmp, so we
+ // need an inline implementation.
+ if (!getTarget().getTriple().isOSMSVCRT())
+ break;
+
+ llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
+
+ Value *Dst = EmitScalarExpr(E->getArg(0));
+ Value *Src = EmitScalarExpr(E->getArg(1));
+ Value *Size = EmitScalarExpr(E->getArg(2));
+
+ BasicBlock *Entry = Builder.GetInsertBlock();
+ BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
+ BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
+ BasicBlock *Next = createBasicBlock("wmemcmp.next");
+ BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
+ Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
+ Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
+
+ EmitBlock(CmpGT);
+ PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
+ DstPhi->addIncoming(Dst, Entry);
+ PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
+ SrcPhi->addIncoming(Src, Entry);
+ PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
+ SizePhi->addIncoming(Size, Entry);
+ CharUnits WCharAlign =
+ getContext().getTypeAlignInChars(getContext().WCharTy);
+ Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
+ Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
+ Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
+ Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
+
+ EmitBlock(CmpLT);
+ Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
+ Builder.CreateCondBr(DstLtSrc, Exit, Next);
+
+ EmitBlock(Next);
+ Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
+ Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
+ Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
+ Value *NextSizeEq0 =
+ Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
+ Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
+ DstPhi->addIncoming(NextDst, Next);
+ SrcPhi->addIncoming(NextSrc, Next);
+ SizePhi->addIncoming(NextSize, Next);
+
+ EmitBlock(Exit);
+ PHINode *Ret = Builder.CreatePHI(IntTy, 4);
+ Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
+ Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
+ Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
+ Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
+ return RValue::get(Ret);
+ }
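
The block structure above implements, in effect, the following loop (a reference sketch, only relevant on MSVCRT targets where the runtime lacks wmemcmp; the element comparison is unsigned, matching the CreateICmpUGT/ULT calls):

    #include <cstddef>

    int wmemcmp_ref(const wchar_t *A, const wchar_t *B, size_t N) {
      for (; N != 0; --N, ++A, ++B) {   // wmemcmp.next decrements and loops back to wmemcmp.gt
        if (*A > *B) return 1;          // wmemcmp.gt
        if (*A < *B) return -1;         // wmemcmp.lt
      }
      return 0;
    }
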
case Builtin::BI__builtin_dwarf_cfa: {
// The offset in bytes from the first argument to the CFA.
//
@@ -2037,7 +2415,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__sync_synchronize: {
// We assume this is supposed to correspond to a C++0x-style
// sequentially-consistent fence (i.e. this is only usable for
- // synchonization, not device I/O or anything like that). This intrinsic
+ // synchronization, not device I/O or anything like that). This intrinsic
// is really badly designed in the sense that in theory, there isn't
// any way to safely use it... but in practice, it mostly works
// to use it with non-atomic loads and stores to get acquire/release
@@ -2552,11 +2930,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__builtin_addressof:
return RValue::get(EmitLValue(E->getArg(0)).getPointer());
case Builtin::BI__builtin_operator_new:
- return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(),
- E->getArg(0), false);
+ return EmitBuiltinNewDeleteCall(
+ E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
case Builtin::BI__builtin_operator_delete:
- return EmitBuiltinNewDeleteCall(FD->getType()->castAs<FunctionProtoType>(),
- E->getArg(0), true);
+ return EmitBuiltinNewDeleteCall(
+ E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);
+
case Builtin::BI__noop:
// __noop always evaluates to an integer literal zero.
return RValue::get(ConstantInt::get(IntTy, 0));
@@ -2643,9 +3022,26 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI_InterlockedXor16:
case Builtin::BI_InterlockedXor:
return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
+
+ case Builtin::BI_bittest64:
+ case Builtin::BI_bittest:
+ case Builtin::BI_bittestandcomplement64:
+ case Builtin::BI_bittestandcomplement:
+ case Builtin::BI_bittestandreset64:
+ case Builtin::BI_bittestandreset:
+ case Builtin::BI_bittestandset64:
+ case Builtin::BI_bittestandset:
+ case Builtin::BI_interlockedbittestandreset:
+ case Builtin::BI_interlockedbittestandreset64:
+ case Builtin::BI_interlockedbittestandset64:
case Builtin::BI_interlockedbittestandset:
- return RValue::get(
- EmitMSVCBuiltinExpr(MSVCIntrin::_interlockedbittestandset, E));
+ case Builtin::BI_interlockedbittestandset_acq:
+ case Builtin::BI_interlockedbittestandset_rel:
+ case Builtin::BI_interlockedbittestandset_nf:
+ case Builtin::BI_interlockedbittestandreset_acq:
+ case Builtin::BI_interlockedbittestandreset_rel:
+ case Builtin::BI_interlockedbittestandreset_nf:
+ return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
case Builtin::BI__exception_code:
case Builtin::BI_exception_code:
@@ -2656,59 +3052,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__abnormal_termination:
case Builtin::BI_abnormal_termination:
return RValue::get(EmitSEHAbnormalTermination());
- case Builtin::BI_setjmpex: {
- if (getTarget().getTriple().isOSMSVCRT()) {
- llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy};
- llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
- getLLVMContext(), llvm::AttributeList::FunctionIndex,
- llvm::Attribute::ReturnsTwice);
- llvm::Constant *SetJmpEx = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false),
- "_setjmpex", ReturnsTwiceAttr, /*Local=*/true);
- llvm::Value *Buf = Builder.CreateBitOrPointerCast(
- EmitScalarExpr(E->getArg(0)), Int8PtrTy);
- llvm::Value *FrameAddr =
- Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
- ConstantInt::get(Int32Ty, 0));
- llvm::Value *Args[] = {Buf, FrameAddr};
- llvm::CallSite CS = EmitRuntimeCallOrInvoke(SetJmpEx, Args);
- CS.setAttributes(ReturnsTwiceAttr);
- return RValue::get(CS.getInstruction());
- }
+ case Builtin::BI_setjmpex:
+ if (getTarget().getTriple().isOSMSVCRT())
+ return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
break;
- }
- case Builtin::BI_setjmp: {
+ case Builtin::BI_setjmp:
if (getTarget().getTriple().isOSMSVCRT()) {
- llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
- getLLVMContext(), llvm::AttributeList::FunctionIndex,
- llvm::Attribute::ReturnsTwice);
- llvm::Value *Buf = Builder.CreateBitOrPointerCast(
- EmitScalarExpr(E->getArg(0)), Int8PtrTy);
- llvm::CallSite CS;
- if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
- llvm::Type *ArgTypes[] = {Int8PtrTy, IntTy};
- llvm::Constant *SetJmp3 = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/true),
- "_setjmp3", ReturnsTwiceAttr, /*Local=*/true);
- llvm::Value *Count = ConstantInt::get(IntTy, 0);
- llvm::Value *Args[] = {Buf, Count};
- CS = EmitRuntimeCallOrInvoke(SetJmp3, Args);
- } else {
- llvm::Type *ArgTypes[] = {Int8PtrTy, Int8PtrTy};
- llvm::Constant *SetJmp = CGM.CreateRuntimeFunction(
- llvm::FunctionType::get(IntTy, ArgTypes, /*isVarArg=*/false),
- "_setjmp", ReturnsTwiceAttr, /*Local=*/true);
- llvm::Value *FrameAddr =
- Builder.CreateCall(CGM.getIntrinsic(Intrinsic::frameaddress),
- ConstantInt::get(Int32Ty, 0));
- llvm::Value *Args[] = {Buf, FrameAddr};
- CS = EmitRuntimeCallOrInvoke(SetJmp, Args);
- }
- CS.setAttributes(ReturnsTwiceAttr);
- return RValue::get(CS.getInstruction());
+ if (getTarget().getTriple().getArch() == llvm::Triple::x86)
+ return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
+ else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
+ return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
+ return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
}
break;
- }
case Builtin::BI__GetExceptionInfo: {
if (llvm::GlobalVariable *GV =
@@ -2736,6 +3092,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
return EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
case Builtin::BI__builtin_coro_frame:
return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
+ case Builtin::BI__builtin_coro_noop:
+ return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
case Builtin::BI__builtin_coro_free:
return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
case Builtin::BI__builtin_coro_destroy:
@@ -2886,11 +3244,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
// OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
case Builtin::BIget_pipe_num_packets:
case Builtin::BIget_pipe_max_packets: {
- const char *Name;
+ const char *BaseName;
+ const PipeType *PipeTy = E->getArg(0)->getType()->getAs<PipeType>();
if (BuiltinID == Builtin::BIget_pipe_num_packets)
- Name = "__get_pipe_num_packets";
+ BaseName = "__get_pipe_num_packets";
else
- Name = "__get_pipe_max_packets";
+ BaseName = "__get_pipe_max_packets";
+ auto Name = std::string(BaseName) +
+ std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
// Building the generic function prototype.
Value *Arg0 = EmitScalarExpr(E->getArg(0));
@@ -2996,10 +3357,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
return Ptr;
};
- // Could have events and/or vaargs.
+ // Could have events and/or varargs.
if (E->getArg(3)->getType()->isBlockPointerType()) {
// No events passed, but has variadic arguments.
- Name = "__enqueue_kernel_vaargs";
+ Name = "__enqueue_kernel_varargs";
auto Info =
CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
llvm::Value *Kernel =
@@ -3067,7 +3428,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
// Pass the number of variadics to the runtime function too.
Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
ArgTys.push_back(Int32Ty);
- Name = "__enqueue_kernel_events_vaargs";
+ Name = "__enqueue_kernel_events_varargs";
auto *PtrToSizeArray = CreateArrayForSizeVar(7);
Args.push_back(PtrToSizeArray);
@@ -3108,7 +3469,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
CGM.CreateRuntimeFunction(
llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
false),
- "__get_kernel_preferred_work_group_multiple_impl"),
+ "__get_kernel_preferred_work_group_size_multiple_impl"),
{Kernel, Arg}));
}
case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
@@ -3179,6 +3540,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
case Builtin::BI__xray_customevent: {
if (!ShouldXRayInstrumentFunction())
return RValue::getIgnored();
+
+ if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
+ XRayInstrKind::Custom))
+ return RValue::getIgnored();
+
if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
return RValue::getIgnored();
@@ -3202,6 +3568,44 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
}
+ case Builtin::BI__xray_typedevent: {
+ // TODO: There should be a way to always emit events even if the current
+ // function is not instrumented. Losing events in a stream can cripple
+ // a trace.
+ if (!ShouldXRayInstrumentFunction())
+ return RValue::getIgnored();
+
+ if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
+ XRayInstrKind::Typed))
+ return RValue::getIgnored();
+
+ if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
+ if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
+ return RValue::getIgnored();
+
+ Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
+ auto FTy = F->getFunctionType();
+ auto Arg0 = EmitScalarExpr(E->getArg(0));
+ auto PTy0 = FTy->getParamType(0);
+ if (PTy0 != Arg0->getType())
+ Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
+ auto Arg1 = E->getArg(1);
+ auto Arg1Val = EmitScalarExpr(Arg1);
+ auto Arg1Ty = Arg1->getType();
+ auto PTy1 = FTy->getParamType(1);
+ if (PTy1 != Arg1Val->getType()) {
+ if (Arg1Ty->isArrayType())
+ Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
+ else
+ Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
+ }
+ auto Arg2 = EmitScalarExpr(E->getArg(2));
+ auto PTy2 = FTy->getParamType(2);
+ if (PTy2 != Arg2->getType())
+ Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
+ return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
+ }
+
case Builtin::BI__builtin_ms_va_start:
case Builtin::BI__builtin_ms_va_end:
return RValue::get(
@@ -3250,6 +3654,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
// can move this up to the beginning of the function.
checkTargetFeatures(E, FD);
+ if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
+ LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
+
// See if we have a target specific intrinsic.
const char *Name = getContext().BuiltinInfo.getName(BuiltinID);
Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
@@ -3257,7 +3664,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
if (!Prefix.empty()) {
IntrinsicID = Intrinsic::getIntrinsicForGCCBuiltin(Prefix.data(), Name);
- // NOTE we dont need to perform a compatibility flag check here since the
+ // NOTE we don't need to perform a compatibility flag check here since the
// intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
// MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
if (IntrinsicID == Intrinsic::not_intrinsic)
@@ -3382,7 +3789,7 @@ Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
NeonTypeFlags TypeFlags,
- llvm::Triple::ArchType Arch,
+ bool HasLegalHalfType=true,
bool V1Ty=false) {
int IsQuad = TypeFlags.isQuad();
switch (TypeFlags.getEltType()) {
@@ -3393,9 +3800,7 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
case NeonTypeFlags::Poly16:
return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
case NeonTypeFlags::Float16:
- // FIXME: Only AArch64 backend can so far properly handle half types.
- // Remove else part once ARM backend support for half is complete.
- if (Arch == llvm::Triple::aarch64)
+ if (HasLegalHalfType)
return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
else
return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
@@ -3458,7 +3863,7 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
return ConstantInt::get(Ty, neg ? -SV : SV);
}
-// \brief Right-shift a vector by a constant.
+// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
llvm::Type *Ty, bool usgn,
const char *name) {
@@ -3561,13 +3966,24 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
+ NEONMAP0(vceqz_v),
+ NEONMAP0(vceqzq_v),
+ NEONMAP0(vcgez_v),
+ NEONMAP0(vcgezq_v),
+ NEONMAP0(vcgtz_v),
+ NEONMAP0(vcgtzq_v),
+ NEONMAP0(vclez_v),
+ NEONMAP0(vclezq_v),
NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
+ NEONMAP0(vcltz_v),
+ NEONMAP0(vcltzq_v),
NEONMAP1(vclz_v, ctlz, Add1ArgType),
NEONMAP1(vclzq_v, ctlz, Add1ArgType),
NEONMAP1(vcnt_v, ctpop, Add1ArgType),
NEONMAP1(vcntq_v, ctpop, Add1ArgType),
NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
+ NEONMAP0(vcvt_f16_v),
NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
NEONMAP0(vcvt_f32_v),
NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
@@ -3587,6 +4003,7 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0),
NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
+ NEONMAP1(vcvta_u16_v, arm_neon_vcvtau, 0),
NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0),
@@ -3631,6 +4048,7 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0),
NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
+ NEONMAP0(vcvtq_f16_v),
NEONMAP0(vcvtq_f32_v),
NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
@@ -3646,6 +4064,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP0(vcvtq_u16_v),
NEONMAP0(vcvtq_u32_v),
NEONMAP0(vcvtq_u64_v),
+ NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
+ NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
NEONMAP0(vext_v),
NEONMAP0(vextq_v),
NEONMAP0(vfma_v),
@@ -3656,18 +4076,30 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
NEONMAP0(vld1_dup_v),
NEONMAP1(vld1_v, arm_neon_vld1, 0),
+ NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
+ NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
+ NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
NEONMAP0(vld1q_dup_v),
NEONMAP1(vld1q_v, arm_neon_vld1, 0),
+ NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
+ NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
+ NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
+ NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
NEONMAP1(vld2_v, arm_neon_vld2, 0),
+ NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
NEONMAP1(vld2q_v, arm_neon_vld2, 0),
+ NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
NEONMAP1(vld3_v, arm_neon_vld3, 0),
+ NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
NEONMAP1(vld3q_v, arm_neon_vld3, 0),
+ NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
NEONMAP1(vld4_v, arm_neon_vld4, 0),
+ NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
NEONMAP1(vld4q_v, arm_neon_vld4, 0),
NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
@@ -3726,6 +4158,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
+ NEONMAP0(vrndi_v),
+ NEONMAP0(vrndiq_v),
NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
@@ -3759,7 +4193,13 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP0(vshrn_n_v),
NEONMAP0(vshrq_n_v),
NEONMAP1(vst1_v, arm_neon_vst1, 0),
+ NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
+ NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
+ NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
NEONMAP1(vst1q_v, arm_neon_vst1, 0),
+ NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
+ NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
+ NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
NEONMAP1(vst2_v, arm_neon_vst2, 0),
NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
@@ -3799,8 +4239,18 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
+ NEONMAP0(vceqz_v),
+ NEONMAP0(vceqzq_v),
+ NEONMAP0(vcgez_v),
+ NEONMAP0(vcgezq_v),
+ NEONMAP0(vcgtz_v),
+ NEONMAP0(vcgtzq_v),
+ NEONMAP0(vclez_v),
+ NEONMAP0(vclezq_v),
NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
+ NEONMAP0(vcltz_v),
+ NEONMAP0(vcltzq_v),
NEONMAP1(vclz_v, ctlz, Add1ArgType),
NEONMAP1(vclzq_v, ctlz, Add1ArgType),
NEONMAP1(vcnt_v, ctpop, Add1ArgType),
@@ -3830,6 +4280,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
+ NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
+ NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
NEONMAP0(vext_v),
NEONMAP0(vextq_v),
NEONMAP0(vfma_v),
@@ -3838,6 +4290,12 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
+ NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
+ NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
+ NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
+ NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
+ NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
+ NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
NEONMAP0(vmovl_v),
NEONMAP0(vmovn_v),
NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
@@ -3878,6 +4336,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
+ NEONMAP0(vrndi_v),
+ NEONMAP0(vrndiq_v),
NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
@@ -3901,6 +4361,12 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(vshr_n_v),
NEONMAP0(vshrn_n_v),
NEONMAP0(vshrq_n_v),
+ NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
+ NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
+ NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
+ NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
+ NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
+ NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
NEONMAP0(vsubhn_v),
NEONMAP0(vtst_v),
NEONMAP0(vtstq_v),
@@ -4099,6 +4565,37 @@ static const NeonIntrinsicInfo AArch64SISDIntrinsicMap[] = {
NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
+ // FP16 scalar intrinsics go here.
+ NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
+ NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
+ NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
+ NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
+ NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
+ NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
+ NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
};
#undef NEONMAP0
@@ -4248,8 +4745,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
NeonTypeFlags Type(NeonTypeConst.getZExtValue());
bool Usgn = Type.isUnsigned();
bool Quad = Type.isQuad();
+ const bool HasLegalHalfType = getTarget().hasLegalHalfType();
- llvm::VectorType *VTy = GetNeonType(this, Type, Arch);
+ llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType);
llvm::Type *Ty = VTy;
if (!Ty)
return nullptr;
@@ -4314,6 +4812,26 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
return EmitNeonCall(F, Ops, NameHint);
}
+ case NEON::BI__builtin_neon_vceqz_v:
+ case NEON::BI__builtin_neon_vceqzq_v:
+ return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
+ ICmpInst::ICMP_EQ, "vceqz");
+ case NEON::BI__builtin_neon_vcgez_v:
+ case NEON::BI__builtin_neon_vcgezq_v:
+ return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
+ ICmpInst::ICMP_SGE, "vcgez");
+ case NEON::BI__builtin_neon_vclez_v:
+ case NEON::BI__builtin_neon_vclezq_v:
+ return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
+ ICmpInst::ICMP_SLE, "vclez");
+ case NEON::BI__builtin_neon_vcgtz_v:
+ case NEON::BI__builtin_neon_vcgtzq_v:
+ return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
+ ICmpInst::ICMP_SGT, "vcgtz");
+ case NEON::BI__builtin_neon_vcltz_v:
+ case NEON::BI__builtin_neon_vcltzq_v:
+ return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
+ ICmpInst::ICMP_SLT, "vcltz");
case NEON::BI__builtin_neon_vclz_v:
case NEON::BI__builtin_neon_vclzq_v:
// We generate target-independent intrinsic, which needs a second argument
@@ -4323,13 +4841,15 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
case NEON::BI__builtin_neon_vcvt_f32_v:
case NEON::BI__builtin_neon_vcvtq_f32_v:
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad), Arch);
+ Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
+ HasLegalHalfType);
return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
: Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
case NEON::BI__builtin_neon_vcvt_f16_v:
case NEON::BI__builtin_neon_vcvtq_f16_v:
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), Arch);
+ Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
+ HasLegalHalfType);
return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
: Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
case NEON::BI__builtin_neon_vcvt_n_f16_v:
@@ -4378,6 +4898,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
case NEON::BI__builtin_neon_vcvta_s16_v:
case NEON::BI__builtin_neon_vcvta_s32_v:
case NEON::BI__builtin_neon_vcvta_s64_v:
+ case NEON::BI__builtin_neon_vcvta_u16_v:
case NEON::BI__builtin_neon_vcvta_u32_v:
case NEON::BI__builtin_neon_vcvta_u64_v:
case NEON::BI__builtin_neon_vcvtaq_s16_v:
@@ -4452,12 +4973,33 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
Ops.push_back(getAlignmentValue32(PtrOp0));
return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
}
+ case NEON::BI__builtin_neon_vld1_x2_v:
+ case NEON::BI__builtin_neon_vld1q_x2_v:
+ case NEON::BI__builtin_neon_vld1_x3_v:
+ case NEON::BI__builtin_neon_vld1q_x3_v:
+ case NEON::BI__builtin_neon_vld1_x4_v:
+ case NEON::BI__builtin_neon_vld1q_x4_v: {
+ llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
+ Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
+ llvm::Type *Tys[2] = { VTy, PTy };
+ Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
+ Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
+ Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
+ Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+ return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
+ }
case NEON::BI__builtin_neon_vld2_v:
case NEON::BI__builtin_neon_vld2q_v:
case NEON::BI__builtin_neon_vld3_v:
case NEON::BI__builtin_neon_vld3q_v:
case NEON::BI__builtin_neon_vld4_v:
- case NEON::BI__builtin_neon_vld4q_v: {
+ case NEON::BI__builtin_neon_vld4q_v:
+ case NEON::BI__builtin_neon_vld2_dup_v:
+ case NEON::BI__builtin_neon_vld2q_dup_v:
+ case NEON::BI__builtin_neon_vld3_dup_v:
+ case NEON::BI__builtin_neon_vld3q_dup_v:
+ case NEON::BI__builtin_neon_vld4_dup_v:
+ case NEON::BI__builtin_neon_vld4q_dup_v: {
llvm::Type *Tys[] = {Ty, Int8PtrTy};
Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
Value *Align = getAlignmentValue32(PtrOp1);
@@ -4556,7 +5098,10 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
case NEON::BI__builtin_neon_vrsqrteq_v:
Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
-
+ case NEON::BI__builtin_neon_vrndi_v:
+ case NEON::BI__builtin_neon_vrndiq_v:
+ Int = Intrinsic::nearbyint;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
case NEON::BI__builtin_neon_vrshr_n_v:
case NEON::BI__builtin_neon_vrshrq_n_v:
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
@@ -4607,6 +5152,23 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
Ops.push_back(getAlignmentValue32(PtrOp0));
return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
}
+ case NEON::BI__builtin_neon_vst1_x2_v:
+ case NEON::BI__builtin_neon_vst1q_x2_v:
+ case NEON::BI__builtin_neon_vst1_x3_v:
+ case NEON::BI__builtin_neon_vst1q_x3_v:
+ case NEON::BI__builtin_neon_vst1_x4_v:
+ case NEON::BI__builtin_neon_vst1q_x4_v: {
+ llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
+ // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
+ // in AArch64 it comes last. We may want to stick to one or the other.
+ if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
+ llvm::Type *Tys[2] = { VTy, PTy };
+ std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
+ return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
+ }
+ llvm::Type *Tys[2] = { PTy, VTy };
+ return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
+ }
case NEON::BI__builtin_neon_vsubhn_v: {
llvm::VectorType *SrcTy =
llvm::VectorType::getExtendedElementVectorType(VTy);
@@ -4689,6 +5251,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
}
return SV;
}
+ case NEON::BI__builtin_neon_vdot_v:
+ case NEON::BI__builtin_neon_vdotq_v: {
+ llvm::Type *InputTy =
+ llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+ llvm::Type *Tys[2] = { Ty, InputTy };
+ Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
+ }
}
assert(Int && "Expected valid intrinsic number");
@@ -4897,6 +5467,34 @@ static bool HasExtraNeonArgument(unsigned BuiltinID) {
return true;
}
+Value *CodeGenFunction::EmitISOVolatileLoad(const CallExpr *E) {
+ Value *Ptr = EmitScalarExpr(E->getArg(0));
+ QualType ElTy = E->getArg(0)->getType()->getPointeeType();
+ CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy);
+ llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
+ LoadSize.getQuantity() * 8);
+ Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
+ llvm::LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, LoadSize);
+ Load->setVolatile(true);
+ return Load;
+}
+
+Value *CodeGenFunction::EmitISOVolatileStore(const CallExpr *E) {
+ Value *Ptr = EmitScalarExpr(E->getArg(0));
+ Value *Value = EmitScalarExpr(E->getArg(1));
+ QualType ElTy = E->getArg(0)->getType()->getPointeeType();
+ CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
+ llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
+ StoreSize.getQuantity() * 8);
+ Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
+ llvm::StoreInst *Store =
+ Builder.CreateAlignedStore(Value, Ptr,
+ StoreSize);
+ Store->setVolatile(true);
+ return Store;
+}
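
A minimal sketch of what the two helpers amount to, shown for the 32-bit case (illustrative; the function names here are not real APIs): the pointee is reinterpreted as an integer of the same width and accessed with a naturally aligned volatile load or store, and no fence is emitted.

    static int isoVolatileLoad32Equivalent(void *P) {
      return *static_cast<volatile int *>(P);   // volatile load of the pointee's width
    }
    static void isoVolatileStore32Equivalent(void *P, int V) {
      *static_cast<volatile int *>(P) = V;      // volatile store, no barrier
    }
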
+
Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
const CallExpr *E,
llvm::Triple::ArchType Arch) {
@@ -5139,35 +5737,13 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
case ARM::BI__iso_volatile_load8:
case ARM::BI__iso_volatile_load16:
case ARM::BI__iso_volatile_load32:
- case ARM::BI__iso_volatile_load64: {
- Value *Ptr = EmitScalarExpr(E->getArg(0));
- QualType ElTy = E->getArg(0)->getType()->getPointeeType();
- CharUnits LoadSize = getContext().getTypeSizeInChars(ElTy);
- llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
- LoadSize.getQuantity() * 8);
- Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
- llvm::LoadInst *Load =
- Builder.CreateAlignedLoad(Ptr, LoadSize);
- Load->setVolatile(true);
- return Load;
- }
+ case ARM::BI__iso_volatile_load64:
+ return EmitISOVolatileLoad(E);
case ARM::BI__iso_volatile_store8:
case ARM::BI__iso_volatile_store16:
case ARM::BI__iso_volatile_store32:
- case ARM::BI__iso_volatile_store64: {
- Value *Ptr = EmitScalarExpr(E->getArg(0));
- Value *Value = EmitScalarExpr(E->getArg(1));
- QualType ElTy = E->getArg(0)->getType()->getPointeeType();
- CharUnits StoreSize = getContext().getTypeSizeInChars(ElTy);
- llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
- StoreSize.getQuantity() * 8);
- Ptr = Builder.CreateBitCast(Ptr, ITy->getPointerTo());
- llvm::StoreInst *Store =
- Builder.CreateAlignedStore(Value, Ptr,
- StoreSize);
- Store->setVolatile(true);
- return Store;
- }
+ case ARM::BI__iso_volatile_store64:
+ return EmitISOVolatileStore(E);
}
if (BuiltinID == ARM::BI__builtin_arm_clrex) {
@@ -5312,8 +5888,11 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vld4_lane_v:
case NEON::BI__builtin_neon_vld4q_lane_v:
case NEON::BI__builtin_neon_vld2_dup_v:
+ case NEON::BI__builtin_neon_vld2q_dup_v:
case NEON::BI__builtin_neon_vld3_dup_v:
+ case NEON::BI__builtin_neon_vld3q_dup_v:
case NEON::BI__builtin_neon_vld4_dup_v:
+ case NEON::BI__builtin_neon_vld4q_dup_v:
// Get the alignment for the argument in addition to the value;
// we'll use it later.
PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
@@ -5349,6 +5928,12 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vgetq_lane_f32:
return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
+ case NEON::BI__builtin_neon_vrndns_f32: {
+ Value *Arg = EmitScalarExpr(E->getArg(0));
+ llvm::Type *Tys[] = {Arg->getType()};
+ Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
+ return Builder.CreateCall(F, {Arg}, "vrndn"); }
+
case NEON::BI__builtin_neon_vset_lane_i8:
case NEON::BI__builtin_neon_vset_lane_i16:
case NEON::BI__builtin_neon_vset_lane_i32:
@@ -5438,7 +6023,8 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
bool usgn = Type.isUnsigned();
bool rightShift = false;
- llvm::VectorType *VTy = GetNeonType(this, Type, Arch);
+ llvm::VectorType *VTy = GetNeonType(this, Type,
+ getTarget().hasLegalHalfType());
llvm::Type *Ty = VTy;
if (!Ty)
return nullptr;
@@ -5483,68 +6069,6 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
Value *Ld = Builder.CreateLoad(PtrOp0);
return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
}
- case NEON::BI__builtin_neon_vld2_dup_v:
- case NEON::BI__builtin_neon_vld3_dup_v:
- case NEON::BI__builtin_neon_vld4_dup_v: {
- // Handle 64-bit elements as a special-case. There is no "dup" needed.
- if (VTy->getElementType()->getPrimitiveSizeInBits() == 64) {
- switch (BuiltinID) {
- case NEON::BI__builtin_neon_vld2_dup_v:
- Int = Intrinsic::arm_neon_vld2;
- break;
- case NEON::BI__builtin_neon_vld3_dup_v:
- Int = Intrinsic::arm_neon_vld3;
- break;
- case NEON::BI__builtin_neon_vld4_dup_v:
- Int = Intrinsic::arm_neon_vld4;
- break;
- default: llvm_unreachable("unknown vld_dup intrinsic?");
- }
- llvm::Type *Tys[] = {Ty, Int8PtrTy};
- Function *F = CGM.getIntrinsic(Int, Tys);
- llvm::Value *Align = getAlignmentValue32(PtrOp1);
- Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, "vld_dup");
- Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
- Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
- }
- switch (BuiltinID) {
- case NEON::BI__builtin_neon_vld2_dup_v:
- Int = Intrinsic::arm_neon_vld2lane;
- break;
- case NEON::BI__builtin_neon_vld3_dup_v:
- Int = Intrinsic::arm_neon_vld3lane;
- break;
- case NEON::BI__builtin_neon_vld4_dup_v:
- Int = Intrinsic::arm_neon_vld4lane;
- break;
- default: llvm_unreachable("unknown vld_dup intrinsic?");
- }
- llvm::Type *Tys[] = {Ty, Int8PtrTy};
- Function *F = CGM.getIntrinsic(Int, Tys);
- llvm::StructType *STy = cast<llvm::StructType>(F->getReturnType());
-
- SmallVector<Value*, 6> Args;
- Args.push_back(Ops[1]);
- Args.append(STy->getNumElements(), UndefValue::get(Ty));
-
- llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
- Args.push_back(CI);
- Args.push_back(getAlignmentValue32(PtrOp1));
-
- Ops[1] = Builder.CreateCall(F, Args, "vld_dup");
- // splat lane 0 to all elts in each vector of the result.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Value *Val = Builder.CreateExtractValue(Ops[1], i);
- Value *Elt = Builder.CreateBitCast(Val, Ty);
- Elt = EmitNeonSplat(Elt, CI);
- Elt = Builder.CreateBitCast(Elt, Val->getType());
- Ops[1] = Builder.CreateInsertValue(Ops[1], Elt, i);
- }
- Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
- Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
- }
case NEON::BI__builtin_neon_vqrshrn_n_v:
Int =
usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
@@ -5684,7 +6208,7 @@ static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID
// Determine the type of this overloaded NEON intrinsic.
NeonTypeFlags Type(Result.getZExtValue());
- llvm::VectorType *Ty = GetNeonType(&CGF, Type, Arch);
+ llvm::VectorType *Ty = GetNeonType(&CGF, Type);
if (!Ty)
return nullptr;
@@ -5803,18 +6327,23 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
HintID = 0;
break;
case AArch64::BI__builtin_arm_yield:
+ case AArch64::BI__yield:
HintID = 1;
break;
case AArch64::BI__builtin_arm_wfe:
+ case AArch64::BI__wfe:
HintID = 2;
break;
case AArch64::BI__builtin_arm_wfi:
+ case AArch64::BI__wfi:
HintID = 3;
break;
case AArch64::BI__builtin_arm_sev:
+ case AArch64::BI__sev:
HintID = 4;
break;
case AArch64::BI__builtin_arm_sevl:
+ case AArch64::BI__sevl:
HintID = 5;
break;
}
@@ -6081,6 +6610,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
// Handle non-overloaded intrinsics first.
switch (BuiltinID) {
default: break;
+ case NEON::BI__builtin_neon_vabsh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
case NEON::BI__builtin_neon_vldrq_p128: {
llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
llvm::Type *Int128PTy = llvm::PointerType::get(Int128Ty, 0);
@@ -6123,6 +6655,153 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateUIToFP(Ops[0], FTy);
return Builder.CreateSIToFP(Ops[0], FTy);
}
+ case NEON::BI__builtin_neon_vcvth_f16_u16:
+ case NEON::BI__builtin_neon_vcvth_f16_u32:
+ case NEON::BI__builtin_neon_vcvth_f16_u64:
+ usgn = true;
+ // FALL THROUGH
+ case NEON::BI__builtin_neon_vcvth_f16_s16:
+ case NEON::BI__builtin_neon_vcvth_f16_s32:
+ case NEON::BI__builtin_neon_vcvth_f16_s64: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ llvm::Type *FTy = HalfTy;
+ llvm::Type *InTy;
+ if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
+ InTy = Int64Ty;
+ else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
+ InTy = Int32Ty;
+ else
+ InTy = Int16Ty;
+ Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
+ if (usgn)
+ return Builder.CreateUIToFP(Ops[0], FTy);
+ return Builder.CreateSIToFP(Ops[0], FTy);
+ }
+ case NEON::BI__builtin_neon_vcvth_u16_f16:
+ usgn = true;
+ // FALL THROUGH
+ case NEON::BI__builtin_neon_vcvth_s16_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+ if (usgn)
+ return Builder.CreateFPToUI(Ops[0], Int16Ty);
+ return Builder.CreateFPToSI(Ops[0], Int16Ty);
+ }
+ case NEON::BI__builtin_neon_vcvth_u32_f16:
+ usgn = true;
+ // FALL THROUGH
+ case NEON::BI__builtin_neon_vcvth_s32_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+ if (usgn)
+ return Builder.CreateFPToUI(Ops[0], Int32Ty);
+ return Builder.CreateFPToSI(Ops[0], Int32Ty);
+ }
+ case NEON::BI__builtin_neon_vcvth_u64_f16:
+ usgn = true;
+ // FALL THROUGH
+ case NEON::BI__builtin_neon_vcvth_s64_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+ if (usgn)
+ return Builder.CreateFPToUI(Ops[0], Int64Ty);
+ return Builder.CreateFPToSI(Ops[0], Int64Ty);
+ }
+ case NEON::BI__builtin_neon_vcvtah_u16_f16:
+ case NEON::BI__builtin_neon_vcvtmh_u16_f16:
+ case NEON::BI__builtin_neon_vcvtnh_u16_f16:
+ case NEON::BI__builtin_neon_vcvtph_u16_f16:
+ case NEON::BI__builtin_neon_vcvtah_s16_f16:
+ case NEON::BI__builtin_neon_vcvtmh_s16_f16:
+ case NEON::BI__builtin_neon_vcvtnh_s16_f16:
+ case NEON::BI__builtin_neon_vcvtph_s16_f16: {
+ unsigned Int;
+ llvm::Type* InTy = Int32Ty;
+ llvm::Type* FTy = HalfTy;
+ llvm::Type *Tys[2] = {InTy, FTy};
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ switch (BuiltinID) {
+ default: llvm_unreachable("missing builtin ID in switch!");
+ case NEON::BI__builtin_neon_vcvtah_u16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtau; break;
+ case NEON::BI__builtin_neon_vcvtmh_u16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtmu; break;
+ case NEON::BI__builtin_neon_vcvtnh_u16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtnu; break;
+ case NEON::BI__builtin_neon_vcvtph_u16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtpu; break;
+ case NEON::BI__builtin_neon_vcvtah_s16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtas; break;
+ case NEON::BI__builtin_neon_vcvtmh_s16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtms; break;
+ case NEON::BI__builtin_neon_vcvtnh_s16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtns; break;
+ case NEON::BI__builtin_neon_vcvtph_s16_f16:
+ Int = Intrinsic::aarch64_neon_fcvtps; break;
+ }
+ Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
+ return Builder.CreateTrunc(Ops[0], Int16Ty);
+ }
+ case NEON::BI__builtin_neon_vcaleh_f16:
+ case NEON::BI__builtin_neon_vcalth_f16:
+ case NEON::BI__builtin_neon_vcageh_f16:
+ case NEON::BI__builtin_neon_vcagth_f16: {
+ unsigned Int;
+ llvm::Type* InTy = Int32Ty;
+ llvm::Type* FTy = HalfTy;
+ llvm::Type *Tys[2] = {InTy, FTy};
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ switch (BuiltinID) {
+ default: llvm_unreachable("missing builtin ID in switch!");
+ case NEON::BI__builtin_neon_vcageh_f16:
+ Int = Intrinsic::aarch64_neon_facge; break;
+ case NEON::BI__builtin_neon_vcagth_f16:
+ Int = Intrinsic::aarch64_neon_facgt; break;
+ case NEON::BI__builtin_neon_vcaleh_f16:
+ Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
+ case NEON::BI__builtin_neon_vcalth_f16:
+ Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
+ }
+ Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
+ return Builder.CreateTrunc(Ops[0], Int16Ty);
+ }
+ case NEON::BI__builtin_neon_vcvth_n_s16_f16:
+ case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
+ unsigned Int;
+ llvm::Type* InTy = Int32Ty;
+ llvm::Type* FTy = HalfTy;
+ llvm::Type *Tys[2] = {InTy, FTy};
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ switch (BuiltinID) {
+ default: llvm_unreachable("missing builtin ID in switch!");
+ case NEON::BI__builtin_neon_vcvth_n_s16_f16:
+ Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
+ case NEON::BI__builtin_neon_vcvth_n_u16_f16:
+ Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
+ }
+ Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
+ return Builder.CreateTrunc(Ops[0], Int16Ty);
+ }
+ case NEON::BI__builtin_neon_vcvth_n_f16_s16:
+ case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
+ unsigned Int;
+ llvm::Type* FTy = HalfTy;
+ llvm::Type* InTy = Int32Ty;
+ llvm::Type *Tys[2] = {FTy, InTy};
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ switch (BuiltinID) {
+ default: llvm_unreachable("missing builtin ID in switch!");
+ case NEON::BI__builtin_neon_vcvth_n_f16_s16:
+ Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
+ Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
+ break;
+ case NEON::BI__builtin_neon_vcvth_n_f16_u16:
+ Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
+ Ops[0] = Builder.CreateZExt(Ops[0], InTy);
+ break;
+ }
+ return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
+ }
case NEON::BI__builtin_neon_vpaddd_s64: {
llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2);
Value *Vec = EmitScalarExpr(E->getArg(0));
@@ -6164,6 +6843,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vceqzd_s64:
case NEON::BI__builtin_neon_vceqzd_f64:
case NEON::BI__builtin_neon_vceqzs_f32:
+ case NEON::BI__builtin_neon_vceqzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6171,6 +6851,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vcgezd_s64:
case NEON::BI__builtin_neon_vcgezd_f64:
case NEON::BI__builtin_neon_vcgezs_f32:
+ case NEON::BI__builtin_neon_vcgezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6178,6 +6859,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vclezd_s64:
case NEON::BI__builtin_neon_vclezd_f64:
case NEON::BI__builtin_neon_vclezs_f32:
+ case NEON::BI__builtin_neon_vclezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6185,6 +6867,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vcgtzd_s64:
case NEON::BI__builtin_neon_vcgtzd_f64:
case NEON::BI__builtin_neon_vcgtzs_f32:
+ case NEON::BI__builtin_neon_vcgtzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6192,6 +6875,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vcltzd_s64:
case NEON::BI__builtin_neon_vcltzd_f64:
case NEON::BI__builtin_neon_vcltzs_f32:
+ case NEON::BI__builtin_neon_vcltzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
@@ -6244,6 +6928,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
}
+ case NEON::BI__builtin_neon_vceqh_f16:
+ case NEON::BI__builtin_neon_vcleh_f16:
+ case NEON::BI__builtin_neon_vclth_f16:
+ case NEON::BI__builtin_neon_vcgeh_f16:
+ case NEON::BI__builtin_neon_vcgth_f16: {
+ llvm::CmpInst::Predicate P;
+ switch (BuiltinID) {
+ default: llvm_unreachable("missing builtin ID in switch!");
+ case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
+ case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
+ case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
+ case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
+ case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
+ }
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
+ Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
+ Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
+ return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
+ }
case NEON::BI__builtin_neon_vceqd_s64:
case NEON::BI__builtin_neon_vceqd_u64:
case NEON::BI__builtin_neon_vcgtd_s64:
@@ -6381,6 +7085,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
llvm::VectorType::get(DoubleTy, 2));
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
"vgetq_lane");
+ case NEON::BI__builtin_neon_vaddh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
+ case NEON::BI__builtin_neon_vsubh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
+ case NEON::BI__builtin_neon_vmulh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
+ case NEON::BI__builtin_neon_vdivh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
+ case NEON::BI__builtin_neon_vfmah_f16: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+ // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+ return Builder.CreateCall(F,
+ {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
+ }
+ case NEON::BI__builtin_neon_vfmsh_f16: {
+ Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy);
+ Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy);
+ Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh");
+ // NEON intrinsic puts accumulator first, unlike the LLVM fma.
+ return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]});
+ }
case NEON::BI__builtin_neon_vaddd_s64:
case NEON::BI__builtin_neon_vaddd_u64:
return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
@@ -6538,7 +7267,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
}
}
- llvm::VectorType *VTy = GetNeonType(this, Type, Arch);
+ llvm::VectorType *VTy = GetNeonType(this, Type);
llvm::Type *Ty = VTy;
if (!Ty)
return nullptr;
@@ -6603,7 +7332,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
llvm::Type *VTy = GetNeonType(this,
- NeonTypeFlags(NeonTypeFlags::Float64, false, true), Arch);
+ NeonTypeFlags(NeonTypeFlags::Float64, false, true));
Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
Value *F = CGM.getIntrinsic(Intrinsic::fma, DoubleTy);
@@ -6655,12 +7384,22 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
+ case NEON::BI__builtin_neon_vmaxh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ Int = Intrinsic::aarch64_neon_fmax;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
+ }
case NEON::BI__builtin_neon_vmin_v:
case NEON::BI__builtin_neon_vminq_v:
// FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
+ case NEON::BI__builtin_neon_vminh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ Int = Intrinsic::aarch64_neon_fmin;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
+ }
case NEON::BI__builtin_neon_vabd_v:
case NEON::BI__builtin_neon_vabdq_v:
// FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
@@ -6699,20 +7438,31 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vminnmq_v:
Int = Intrinsic::aarch64_neon_fminnm;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
+ case NEON::BI__builtin_neon_vminnmh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ Int = Intrinsic::aarch64_neon_fminnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
case NEON::BI__builtin_neon_vmaxnm_v:
case NEON::BI__builtin_neon_vmaxnmq_v:
Int = Intrinsic::aarch64_neon_fmaxnm;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
+ case NEON::BI__builtin_neon_vmaxnmh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ Int = Intrinsic::aarch64_neon_fmaxnm;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
case NEON::BI__builtin_neon_vrecpss_f32: {
Ops.push_back(EmitScalarExpr(E->getArg(1)));
return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
Ops, "vrecps");
}
- case NEON::BI__builtin_neon_vrecpsd_f64: {
+ case NEON::BI__builtin_neon_vrecpsd_f64:
Ops.push_back(EmitScalarExpr(E->getArg(1)));
return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
Ops, "vrecps");
- }
+ case NEON::BI__builtin_neon_vrecpsh_f16:
+ Ops.push_back(EmitScalarExpr(E->getArg(1)));
+ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
+ Ops, "vrecps");
case NEON::BI__builtin_neon_vqshrun_n_v:
Int = Intrinsic::aarch64_neon_sqshrun;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
@@ -6728,72 +7478,87 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vqrshrn_n_v:
Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
+ case NEON::BI__builtin_neon_vrndah_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::round;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
+ }
case NEON::BI__builtin_neon_vrnda_v:
case NEON::BI__builtin_neon_vrndaq_v: {
Int = Intrinsic::round;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
}
- case NEON::BI__builtin_neon_vrndi_v:
- case NEON::BI__builtin_neon_vrndiq_v: {
+ case NEON::BI__builtin_neon_vrndih_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
Int = Intrinsic::nearbyint;
- return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi");
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
+ }
+ case NEON::BI__builtin_neon_vrndmh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::floor;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
}
case NEON::BI__builtin_neon_vrndm_v:
case NEON::BI__builtin_neon_vrndmq_v: {
Int = Intrinsic::floor;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
}
+ case NEON::BI__builtin_neon_vrndnh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::aarch64_neon_frintn;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
+ }
case NEON::BI__builtin_neon_vrndn_v:
case NEON::BI__builtin_neon_vrndnq_v: {
Int = Intrinsic::aarch64_neon_frintn;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
}
+ case NEON::BI__builtin_neon_vrndns_f32: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::aarch64_neon_frintn;
+ return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
+ }
+ case NEON::BI__builtin_neon_vrndph_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::ceil;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
+ }
case NEON::BI__builtin_neon_vrndp_v:
case NEON::BI__builtin_neon_vrndpq_v: {
Int = Intrinsic::ceil;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
}
+ case NEON::BI__builtin_neon_vrndxh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::rint;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
+ }
case NEON::BI__builtin_neon_vrndx_v:
case NEON::BI__builtin_neon_vrndxq_v: {
Int = Intrinsic::rint;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
}
+ case NEON::BI__builtin_neon_vrndh_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::trunc;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
+ }
case NEON::BI__builtin_neon_vrnd_v:
case NEON::BI__builtin_neon_vrndq_v: {
Int = Intrinsic::trunc;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
}
- case NEON::BI__builtin_neon_vceqz_v:
- case NEON::BI__builtin_neon_vceqzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
- ICmpInst::ICMP_EQ, "vceqz");
- case NEON::BI__builtin_neon_vcgez_v:
- case NEON::BI__builtin_neon_vcgezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
- ICmpInst::ICMP_SGE, "vcgez");
- case NEON::BI__builtin_neon_vclez_v:
- case NEON::BI__builtin_neon_vclezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
- ICmpInst::ICMP_SLE, "vclez");
- case NEON::BI__builtin_neon_vcgtz_v:
- case NEON::BI__builtin_neon_vcgtzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
- ICmpInst::ICMP_SGT, "vcgtz");
- case NEON::BI__builtin_neon_vcltz_v:
- case NEON::BI__builtin_neon_vcltzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
- ICmpInst::ICMP_SLT, "vcltz");
case NEON::BI__builtin_neon_vcvt_f64_v:
case NEON::BI__builtin_neon_vcvtq_f64_v:
Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad), Arch);
+ Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
: Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
case NEON::BI__builtin_neon_vcvt_f64_f32: {
assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
"unexpected vcvt_f64_f32 builtin");
NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
- Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag, Arch));
+ Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
}
@@ -6801,7 +7566,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
assert(Type.getEltType() == NeonTypeFlags::Float32 &&
"unexpected vcvt_f32_f64 builtin");
NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
- Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag, Arch));
+ Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
}
@@ -6809,20 +7574,21 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
case NEON::BI__builtin_neon_vcvt_u32_v:
case NEON::BI__builtin_neon_vcvt_s64_v:
case NEON::BI__builtin_neon_vcvt_u64_v:
- case NEON::BI__builtin_neon_vcvt_s16_v:
- case NEON::BI__builtin_neon_vcvt_u16_v:
+ case NEON::BI__builtin_neon_vcvt_s16_v:
+ case NEON::BI__builtin_neon_vcvt_u16_v:
case NEON::BI__builtin_neon_vcvtq_s32_v:
case NEON::BI__builtin_neon_vcvtq_u32_v:
case NEON::BI__builtin_neon_vcvtq_s64_v:
case NEON::BI__builtin_neon_vcvtq_u64_v:
- case NEON::BI__builtin_neon_vcvtq_s16_v:
- case NEON::BI__builtin_neon_vcvtq_u16_v: {
+ case NEON::BI__builtin_neon_vcvtq_s16_v:
+ case NEON::BI__builtin_neon_vcvtq_u16_v: {
Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
if (usgn)
return Builder.CreateFPToUI(Ops[0], Ty);
return Builder.CreateFPToSI(Ops[0], Ty);
}
case NEON::BI__builtin_neon_vcvta_s16_v:
+ case NEON::BI__builtin_neon_vcvta_u16_v:
case NEON::BI__builtin_neon_vcvta_s32_v:
case NEON::BI__builtin_neon_vcvtaq_s16_v:
case NEON::BI__builtin_neon_vcvtaq_s32_v:
@@ -6890,6 +7656,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_fmulx;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
}
+ case NEON::BI__builtin_neon_vmulxh_lane_f16:
+ case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
+ // vmulx_lane should be mapped to Neon scalar mulx after
+ // extracting the scalar element
+ Ops.push_back(EmitScalarExpr(E->getArg(2)));
+ Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+ Ops.pop_back();
+ Int = Intrinsic::aarch64_neon_fmulx;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
+ }
case NEON::BI__builtin_neon_vmul_lane_v:
case NEON::BI__builtin_neon_vmul_laneq_v: {
// v1f64 vmul_lane should be mapped to Neon scalar mul lane
@@ -6898,7 +7674,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Quad = true;
Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
llvm::Type *VTy = GetNeonType(this,
- NeonTypeFlags(NeonTypeFlags::Float64, false, Quad), Arch);
+ NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
@@ -6906,6 +7682,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
}
case NEON::BI__builtin_neon_vnegd_s64:
return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
+ case NEON::BI__builtin_neon_vnegh_f16:
+ return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
case NEON::BI__builtin_neon_vpmaxnm_v:
case NEON::BI__builtin_neon_vpmaxnmq_v: {
Int = Intrinsic::aarch64_neon_fmaxnmp;
@@ -6916,6 +7694,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_fminnmp;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
}
+ case NEON::BI__builtin_neon_vsqrth_f16: {
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ Int = Intrinsic::sqrt;
+ return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
+ }
case NEON::BI__builtin_neon_vsqrt_v:
case NEON::BI__builtin_neon_vsqrtq_v: {
Int = Intrinsic::sqrt;
@@ -7293,64 +8076,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
return Builder.CreateAdd(Ops[0], tmp);
}
- // FIXME: Sharing loads & stores with 32-bit is complicated by the absence
- // of an Align parameter here.
- case NEON::BI__builtin_neon_vld1_x2_v:
- case NEON::BI__builtin_neon_vld1q_x2_v:
- case NEON::BI__builtin_neon_vld1_x3_v:
- case NEON::BI__builtin_neon_vld1q_x3_v:
- case NEON::BI__builtin_neon_vld1_x4_v:
- case NEON::BI__builtin_neon_vld1q_x4_v: {
- llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
- Ops[1] = Builder.CreateBitCast(Ops[1], PTy);
- llvm::Type *Tys[2] = { VTy, PTy };
- unsigned Int;
- switch (BuiltinID) {
- case NEON::BI__builtin_neon_vld1_x2_v:
- case NEON::BI__builtin_neon_vld1q_x2_v:
- Int = Intrinsic::aarch64_neon_ld1x2;
- break;
- case NEON::BI__builtin_neon_vld1_x3_v:
- case NEON::BI__builtin_neon_vld1q_x3_v:
- Int = Intrinsic::aarch64_neon_ld1x3;
- break;
- case NEON::BI__builtin_neon_vld1_x4_v:
- case NEON::BI__builtin_neon_vld1q_x4_v:
- Int = Intrinsic::aarch64_neon_ld1x4;
- break;
- }
- Function *F = CGM.getIntrinsic(Int, Tys);
- Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
- Ty = llvm::PointerType::getUnqual(Ops[1]->getType());
- Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
- return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
- }
- case NEON::BI__builtin_neon_vst1_x2_v:
- case NEON::BI__builtin_neon_vst1q_x2_v:
- case NEON::BI__builtin_neon_vst1_x3_v:
- case NEON::BI__builtin_neon_vst1q_x3_v:
- case NEON::BI__builtin_neon_vst1_x4_v:
- case NEON::BI__builtin_neon_vst1q_x4_v: {
- llvm::Type *PTy = llvm::PointerType::getUnqual(VTy->getVectorElementType());
- llvm::Type *Tys[2] = { VTy, PTy };
- unsigned Int;
- switch (BuiltinID) {
- case NEON::BI__builtin_neon_vst1_x2_v:
- case NEON::BI__builtin_neon_vst1q_x2_v:
- Int = Intrinsic::aarch64_neon_st1x2;
- break;
- case NEON::BI__builtin_neon_vst1_x3_v:
- case NEON::BI__builtin_neon_vst1q_x3_v:
- Int = Intrinsic::aarch64_neon_st1x3;
- break;
- case NEON::BI__builtin_neon_vst1_x4_v:
- case NEON::BI__builtin_neon_vst1q_x4_v:
- Int = Intrinsic::aarch64_neon_st1x4;
- break;
- }
- std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
- return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
- }
case NEON::BI__builtin_neon_vld1_v:
case NEON::BI__builtin_neon_vld1q_v: {
Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy));
@@ -7657,6 +8382,38 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_suqadd;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
}
+ case AArch64::BI__iso_volatile_load8:
+ case AArch64::BI__iso_volatile_load16:
+ case AArch64::BI__iso_volatile_load32:
+ case AArch64::BI__iso_volatile_load64:
+ return EmitISOVolatileLoad(E);
+ case AArch64::BI__iso_volatile_store8:
+ case AArch64::BI__iso_volatile_store16:
+ case AArch64::BI__iso_volatile_store32:
+ case AArch64::BI__iso_volatile_store64:
+ return EmitISOVolatileStore(E);
+ case AArch64::BI_BitScanForward:
+ case AArch64::BI_BitScanForward64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanForward, E);
+ case AArch64::BI_BitScanReverse:
+ case AArch64::BI_BitScanReverse64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_BitScanReverse, E);
+ case AArch64::BI_InterlockedAnd64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E);
+ case AArch64::BI_InterlockedExchange64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E);
+ case AArch64::BI_InterlockedExchangeAdd64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E);
+ case AArch64::BI_InterlockedExchangeSub64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E);
+ case AArch64::BI_InterlockedOr64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E);
+ case AArch64::BI_InterlockedXor64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E);
+ case AArch64::BI_InterlockedDecrement64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E);
+ case AArch64::BI_InterlockedIncrement64:
+ return EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E);
}
}
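
The new AArch64 cases above route the MSVC-style _Interlocked*64 builtins through the common MSVC lowering. As a semantic sketch only (assuming sequentially consistent ordering and the documented "returns the incremented value" behaviour; the *_model name is hypothetical):

#include <atomic>
#include <cstdint>

// Sketch of _InterlockedIncrement64 semantics: a seq_cst fetch-add that
// yields the new (incremented) value.
static int64_t interlocked_increment64_model(std::atomic<int64_t> &v) {
  return v.fetch_add(1, std::memory_order_seq_cst) + 1;
}
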
@@ -7708,42 +8465,66 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
}
static Value *EmitX86MaskedStore(CodeGenFunction &CGF,
- SmallVectorImpl<Value *> &Ops,
+ ArrayRef<Value *> Ops,
unsigned Align) {
// Cast the pointer to right type.
- Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
+ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
llvm::PointerType::getUnqual(Ops[1]->getType()));
- // If the mask is all ones just emit a regular store.
- if (const auto *C = dyn_cast<Constant>(Ops[2]))
- if (C->isAllOnesValue())
- return CGF.Builder.CreateAlignedStore(Ops[1], Ops[0], Align);
-
Value *MaskVec = getMaskVecValue(CGF, Ops[2],
Ops[1]->getType()->getVectorNumElements());
- return CGF.Builder.CreateMaskedStore(Ops[1], Ops[0], Align, MaskVec);
+ return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Align, MaskVec);
}
static Value *EmitX86MaskedLoad(CodeGenFunction &CGF,
- SmallVectorImpl<Value *> &Ops, unsigned Align) {
+ ArrayRef<Value *> Ops, unsigned Align) {
// Cast the pointer to right type.
- Ops[0] = CGF.Builder.CreateBitCast(Ops[0],
+ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
llvm::PointerType::getUnqual(Ops[1]->getType()));
- // If the mask is all ones just emit a regular store.
- if (const auto *C = dyn_cast<Constant>(Ops[2]))
- if (C->isAllOnesValue())
- return CGF.Builder.CreateAlignedLoad(Ops[0], Align);
-
Value *MaskVec = getMaskVecValue(CGF, Ops[2],
Ops[1]->getType()->getVectorNumElements());
- return CGF.Builder.CreateMaskedLoad(Ops[0], Align, MaskVec, Ops[1]);
+ return CGF.Builder.CreateMaskedLoad(Ptr, Align, MaskVec, Ops[1]);
+}
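
EmitX86MaskedStore and EmitX86MaskedLoad now always emit the masked memory intrinsics. A lane-by-lane C++ sketch of the intended semantics (assumption: a set mask bit enables the lane, disabled lanes keep the pass-through value on loads and leave memory untouched on stores; the *_model helpers are illustrative only):

#include <array>
#include <cstddef>
#include <cstdint>

template <typename T, std::size_t N>
void masked_store_model(T *dst, const std::array<T, N> &src, std::uint32_t mask) {
  for (std::size_t i = 0; i < N; ++i)
    if (mask & (1u << i))
      dst[i] = src[i];                // only enabled lanes touch memory
}

template <typename T, std::size_t N>
std::array<T, N> masked_load_model(const T *src, std::array<T, N> passthru,
                                   std::uint32_t mask) {
  for (std::size_t i = 0; i < N; ++i)
    if (mask & (1u << i))
      passthru[i] = src[i];           // disabled lanes keep the pass-through
  return passthru;
}
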
+
+static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
+ ArrayRef<Value *> Ops) {
+ llvm::Type *ResultTy = Ops[1]->getType();
+ llvm::Type *PtrTy = ResultTy->getVectorElementType();
+
+ // Cast the pointer to element type.
+ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
+ llvm::PointerType::getUnqual(PtrTy));
+
+ Value *MaskVec = getMaskVecValue(CGF, Ops[2],
+ ResultTy->getVectorNumElements());
+
+ llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
+ ResultTy);
+ return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
+}
+
+static Value *EmitX86CompressStore(CodeGenFunction &CGF,
+ ArrayRef<Value *> Ops) {
+ llvm::Type *ResultTy = Ops[1]->getType();
+ llvm::Type *PtrTy = ResultTy->getVectorElementType();
+
+ // Cast the pointer to element type.
+ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
+ llvm::PointerType::getUnqual(PtrTy));
+
+ Value *MaskVec = getMaskVecValue(CGF, Ops[2],
+ ResultTy->getVectorNumElements());
+
+ llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
+ ResultTy);
+ return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
}
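
EmitX86ExpandLoad and EmitX86CompressStore map the AVX-512 expand/compress builtins onto the generic masked_expandload/masked_compressstore intrinsics. A scalar sketch of what those intrinsics mean (illustrative *_model functions, not clang code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Expand-load reads consecutive elements from memory and scatters them into
// the enabled lanes; disabled lanes keep the pass-through value.
std::vector<int> expand_load_model(const int *mem, std::vector<int> passthru,
                                   std::uint32_t mask) {
  std::size_t next = 0;               // next consecutive memory element
  for (std::size_t i = 0; i < passthru.size(); ++i)
    if (mask & (1u << i))
      passthru[i] = mem[next++];
  return passthru;
}

// Compress-store gathers the enabled lanes and writes them contiguously;
// disabled lanes are simply skipped.
void compress_store_model(int *mem, const std::vector<int> &src,
                          std::uint32_t mask) {
  std::size_t next = 0;               // next consecutive memory slot
  for (std::size_t i = 0; i < src.size(); ++i)
    if (mask & (1u << i))
      mem[next++] = src[i];
}
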
static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
- unsigned NumElts, SmallVectorImpl<Value *> &Ops,
+ unsigned NumElts, ArrayRef<Value *> Ops,
bool InvertLHS = false) {
Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
@@ -7755,26 +8536,6 @@ static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
CGF.Builder.getIntNTy(std::max(NumElts, 8U)));
}
-static Value *EmitX86SubVectorBroadcast(CodeGenFunction &CGF,
- SmallVectorImpl<Value *> &Ops,
- llvm::Type *DstTy,
- unsigned SrcSizeInBits,
- unsigned Align) {
- // Load the subvector.
- Ops[0] = CGF.Builder.CreateAlignedLoad(Ops[0], Align);
-
- // Create broadcast mask.
- unsigned NumDstElts = DstTy->getVectorNumElements();
- unsigned NumSrcElts = SrcSizeInBits / DstTy->getScalarSizeInBits();
-
- SmallVector<uint32_t, 8> Mask;
- for (unsigned i = 0; i != NumDstElts; i += NumSrcElts)
- for (unsigned j = 0; j != NumSrcElts; ++j)
- Mask.push_back(j);
-
- return CGF.Builder.CreateShuffleVector(Ops[0], Ops[0], Mask, "subvecbcst");
-}
-
static Value *EmitX86Select(CodeGenFunction &CGF,
Value *Mask, Value *Op0, Value *Op1) {
@@ -7788,8 +8549,48 @@ static Value *EmitX86Select(CodeGenFunction &CGF,
return CGF.Builder.CreateSelect(Mask, Op0, Op1);
}
+static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
+ Value *Mask, Value *Op0, Value *Op1) {
+ // If the mask is all ones just return first argument.
+ if (const auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Op0;
+
+ llvm::VectorType *MaskTy =
+ llvm::VectorType::get(CGF.Builder.getInt1Ty(),
+ Mask->getType()->getIntegerBitWidth());
+ Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ return CGF.Builder.CreateSelect(Mask, Op0, Op1);
+}
+
+static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
+ unsigned NumElts, Value *MaskIn) {
+ if (MaskIn) {
+ const auto *C = dyn_cast<Constant>(MaskIn);
+ if (!C || !C->isAllOnesValue())
+ Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
+ }
+
+ if (NumElts < 8) {
+ uint32_t Indices[8];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+ for (unsigned i = NumElts; i != 8; ++i)
+ Indices[i] = i % NumElts + NumElts;
+ Cmp = CGF.Builder.CreateShuffleVector(
+ Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
+ }
+
+ return CGF.Builder.CreateBitCast(Cmp,
+ IntegerType::get(CGF.getLLVMContext(),
+ std::max(NumElts, 8U)));
+}
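
EmitX86MaskedCompareResult applies the incoming mask and then widens results with fewer than 8 lanes so they fit the 8-bit mask type; the shuffle pulls the padding bits from an all-zero vector. A bit-level sketch under that assumption (the *_model name is hypothetical):

#include <cstdint>

static std::uint8_t compare_result_model(std::uint8_t cmpBits, unsigned numElts,
                                         std::uint8_t maskIn) {
  std::uint8_t res = cmpBits & maskIn;                       // apply the incoming mask
  if (numElts < 8)
    res &= static_cast<std::uint8_t>((1u << numElts) - 1);   // upper bits padded with zero
  return res;
}
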
+
static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
- bool Signed, SmallVectorImpl<Value *> &Ops) {
+ bool Signed, ArrayRef<Value *> Ops) {
+ assert((Ops.size() == 2 || Ops.size() == 4) &&
+ "Unexpected number of arguments");
unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
Value *Cmp;
@@ -7813,22 +8614,16 @@ static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
}
- const auto *C = dyn_cast<Constant>(Ops.back());
- if (!C || !C->isAllOnesValue())
- Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, Ops.back(), NumElts));
+ Value *MaskIn = nullptr;
+ if (Ops.size() == 4)
+ MaskIn = Ops[3];
- if (NumElts < 8) {
- uint32_t Indices[8];
- for (unsigned i = 0; i != NumElts; ++i)
- Indices[i] = i;
- for (unsigned i = NumElts; i != 8; ++i)
- Indices[i] = i % NumElts + NumElts;
- Cmp = CGF.Builder.CreateShuffleVector(
- Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
- }
- return CGF.Builder.CreateBitCast(Cmp,
- IntegerType::get(CGF.getLLVMContext(),
- std::max(NumElts, 8U)));
+ return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
+}
+
+static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
+ Value *Zero = Constant::getNullValue(In->getType());
+ return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
}
static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
@@ -7838,9 +8633,7 @@ static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
Value *Sub = CGF.Builder.CreateSub(Zero, Ops[0]);
Value *Cmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Zero);
Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Sub);
- if (Ops.size() == 1)
- return Res;
- return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
+ return Res;
}
static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
@@ -7848,14 +8641,214 @@ static Value *EmitX86MinMax(CodeGenFunction &CGF, ICmpInst::Predicate Pred,
Value *Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
Value *Res = CGF.Builder.CreateSelect(Cmp, Ops[0], Ops[1]);
- if (Ops.size() == 2)
- return Res;
+ assert(Ops.size() == 2);
+ return Res;
+}
+
+// Lowers X86 FMA intrinsics to IR.
+static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
+ unsigned BuiltinID, bool IsAddSub) {
+
+ bool Subtract = false;
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ switch (BuiltinID) {
+ default: break;
+ case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
+ IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
+ case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+ IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+ Subtract = true;
+ LLVM_FALLTHROUGH;
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
+ break;
+ }
+
+ Value *A = Ops[0];
+ Value *B = Ops[1];
+ Value *C = Ops[2];
+
+ if (Subtract)
+ C = CGF.Builder.CreateFNeg(C);
+
+ Value *Res;
+
+ // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
+ if (IID != Intrinsic::not_intrinsic &&
+ cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4) {
+ Function *Intr = CGF.CGM.getIntrinsic(IID);
+ Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
+ } else {
+ llvm::Type *Ty = A->getType();
+ Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
+ Res = CGF.Builder.CreateCall(FMA, {A, B, C} );
+
+ if (IsAddSub) {
+ // Negate even elts in C using a mask.
+ unsigned NumElts = Ty->getVectorNumElements();
+ SmallVector<uint32_t, 16> Indices(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i + (i % 2) * NumElts;
+
+ Value *NegC = CGF.Builder.CreateFNeg(C);
+ Value *FMSub = CGF.Builder.CreateCall(FMA, {A, B, NegC} );
+ Res = CGF.Builder.CreateShuffleVector(FMSub, Res, Indices);
+ }
+ }
+
+ // Handle any required masking.
+ Value *MaskFalseVal = nullptr;
+ switch (BuiltinID) {
+ case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ MaskFalseVal = Ops[0];
+ break;
+ case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
+ break;
+ case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+ case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ MaskFalseVal = Ops[2];
+ break;
+ }
+
+ if (MaskFalseVal)
+ return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
+
+ return Res;
+}
+
+static Value *
+EmitScalarFMAExpr(CodeGenFunction &CGF, MutableArrayRef<Value *> Ops,
+ Value *Upper, bool ZeroMask = false, unsigned PTIdx = 0,
+ bool NegAcc = false) {
+ unsigned Rnd = 4;
+ if (Ops.size() > 4)
+ Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
+
+ if (NegAcc)
+ Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
+
+ Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+ Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+ Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+ Value *Res;
+ if (Rnd != 4) {
+ Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ?
+ Intrinsic::x86_avx512_vfmadd_f32 :
+ Intrinsic::x86_avx512_vfmadd_f64;
+ Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
+ {Ops[0], Ops[1], Ops[2], Ops[4]});
+ } else {
+ Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
+ Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
+ }
+ // If we have more than 3 arguments, we need to do masking.
+ if (Ops.size() > 3) {
+ Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
+ : Ops[PTIdx];
- assert(Ops.size() == 4);
- return EmitX86Select(CGF, Ops[3], Res, Ops[2]);
+ // If we negated the accumulator and it's the PassThru value, we need to
+ // bypass the negate. Conveniently Upper should be the same thing in this
+ // case.
+ if (NegAcc && PTIdx == 2)
+ PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
+
+ Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
+ }
+ return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
+}
+
+static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
+ ArrayRef<Value *> Ops) {
+ llvm::Type *Ty = Ops[0]->getType();
+ // Arguments have a vXi32 type so cast to vXi64.
+ Ty = llvm::VectorType::get(CGF.Int64Ty,
+ Ty->getPrimitiveSizeInBits() / 64);
+ Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
+ Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
+
+ if (IsSigned) {
+ // Shift left then arithmetic shift right.
+ Constant *ShiftAmt = ConstantInt::get(Ty, 32);
+ LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
+ LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
+ RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
+ RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
+ } else {
+ // Clear the upper bits.
+ Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
+ LHS = CGF.Builder.CreateAnd(LHS, Mask);
+ RHS = CGF.Builder.CreateAnd(RHS, Mask);
+ }
+
+ return CGF.Builder.CreateMul(LHS, RHS);
}
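
EmitX86Muldq models pmuldq/pmuludq: each 64-bit lane multiplies only the low 32 bits of its operands, sign-extended for the signed form and zero-extended for the unsigned form. A per-lane C++ sketch (assuming two's-complement shifts, as in the emitted IR; the *_model names are made up):

#include <cstdint>

static std::int64_t pmuldq_lane_model(std::uint64_t a, std::uint64_t b) {
  std::int64_t la = static_cast<std::int64_t>(a << 32) >> 32;   // sign-extend low 32 bits
  std::int64_t lb = static_cast<std::int64_t>(b << 32) >> 32;
  return la * lb;
}

static std::uint64_t pmuludq_lane_model(std::uint64_t a, std::uint64_t b) {
  return (a & 0xffffffffu) * (b & 0xffffffffu);                 // zero-extend low 32 bits
}
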
-static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
+// Emit a masked pternlog intrinsic. This only exists because the header has to
+// use a macro and we aren't able to pass the input argument to a pternlog
+// builtin and a select builtin without evaluating it twice.
+static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
+ ArrayRef<Value *> Ops) {
+ llvm::Type *Ty = Ops[0]->getType();
+
+ unsigned VecWidth = Ty->getPrimitiveSizeInBits();
+ unsigned EltWidth = Ty->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_pternlog_d_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_pternlog_q_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
+ Ops.drop_back());
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
+ return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
+}
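
EmitX86Ternlog forwards to the vpternlog intrinsics, whose 8-bit immediate acts as a truth table: for every bit position, the bits of the three sources form a 3-bit index into the immediate. A scalar sketch of one 64-bit chunk (illustrative only):

#include <cstdint>

static std::uint64_t ternlog_model(std::uint64_t a, std::uint64_t b,
                                   std::uint64_t c, std::uint8_t imm) {
  std::uint64_t r = 0;
  for (unsigned bit = 0; bit < 64; ++bit) {
    unsigned idx = (((a >> bit) & 1) << 2) |      // source 1 selects bit 2
                   (((b >> bit) & 1) << 1) |      // source 2 selects bit 1
                   ((c >> bit) & 1);              // source 3 selects bit 0
    r |= static_cast<std::uint64_t>((imm >> idx) & 1) << bit;
  }
  return r;
}
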
+
+static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
llvm::Type *DstTy) {
unsigned NumberOfElements = DstTy->getVectorNumElements();
Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
@@ -7918,11 +8911,10 @@ Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
return EmitX86CpuSupports(FeatureStr);
}
-Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
+uint32_t
+CodeGenFunction::GetX86CpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
// Processor features and mapping to processor feature value.
-
uint32_t FeaturesMask = 0;
-
for (const StringRef &FeatureStr : FeatureStrs) {
unsigned Feature =
StringSwitch<unsigned>(FeatureStr)
@@ -7931,7 +8923,14 @@ Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
;
FeaturesMask |= (1U << Feature);
}
+ return FeaturesMask;
+}
+Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
+ return EmitX86CpuSupports(GetX86CpuSupportsMask(FeatureStrs));
+}
+
+llvm::Value *CodeGenFunction::EmitX86CpuSupports(uint32_t FeaturesMask) {
// Matching the struct layout from the compiler-rt/libgcc structure that is
// filled in:
// unsigned int __cpu_vendor;
@@ -7953,9 +8952,9 @@ Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
Builder.CreateAlignedLoad(CpuFeatures, CharUnits::fromQuantity(4));
// Check the value of the bit corresponding to the feature requested.
- Value *Bitset = Builder.CreateAnd(
- Features, llvm::ConstantInt::get(Int32Ty, FeaturesMask));
- return Builder.CreateICmpNE(Bitset, llvm::ConstantInt::get(Int32Ty, 0));
+ Value *Mask = Builder.getInt32(FeaturesMask);
+ Value *Bitset = Builder.CreateAnd(Features, Mask);
+ return Builder.CreateICmpEQ(Bitset, Mask);
}
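
EmitX86CpuSupports now compares the masked feature word against the full mask instead of testing it against zero, so a multi-feature query succeeds only when every requested bit is set. Conceptually the check reduces to (helper name hypothetical):

#include <cstdint>

static bool cpu_supports_model(std::uint32_t cpuFeatures, std::uint32_t featuresMask) {
  // Every requested feature bit must be present, not just any of them.
  return (cpuFeatures & featuresMask) == featuresMask;
}
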
Value *CodeGenFunction::EmitX86CpuInit() {
@@ -8067,8 +9066,37 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(BuildVector(Ops),
llvm::Type::getX86_MMXTy(getLLVMContext()));
case X86::BI__builtin_ia32_vec_ext_v2si:
- return Builder.CreateExtractElement(Ops[0],
- llvm::ConstantInt::get(Ops[1]->getType(), 0));
+ case X86::BI__builtin_ia32_vec_ext_v16qi:
+ case X86::BI__builtin_ia32_vec_ext_v8hi:
+ case X86::BI__builtin_ia32_vec_ext_v4si:
+ case X86::BI__builtin_ia32_vec_ext_v4sf:
+ case X86::BI__builtin_ia32_vec_ext_v2di:
+ case X86::BI__builtin_ia32_vec_ext_v32qi:
+ case X86::BI__builtin_ia32_vec_ext_v16hi:
+ case X86::BI__builtin_ia32_vec_ext_v8si:
+ case X86::BI__builtin_ia32_vec_ext_v4di: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
+ Index &= NumElts - 1;
+ // These builtins exist so we can ensure the index is an ICE and in range.
+ // Otherwise we could just do this in the header file.
+ return Builder.CreateExtractElement(Ops[0], Index);
+ }
+ case X86::BI__builtin_ia32_vec_set_v16qi:
+ case X86::BI__builtin_ia32_vec_set_v8hi:
+ case X86::BI__builtin_ia32_vec_set_v4si:
+ case X86::BI__builtin_ia32_vec_set_v2di:
+ case X86::BI__builtin_ia32_vec_set_v32qi:
+ case X86::BI__builtin_ia32_vec_set_v16hi:
+ case X86::BI__builtin_ia32_vec_set_v8si:
+ case X86::BI__builtin_ia32_vec_set_v4di: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
+ Index &= NumElts - 1;
+ // These builtins exist so we can ensure the index is an ICE and in range.
+ // Otherwise we could just do this in the header file.
+ return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
+ }
case X86::BI_mm_setcsr:
case X86::BI__builtin_ia32_ldmxcsr: {
Address Tmp = CreateMemTemp(E->getArg(0)->getType());
@@ -8145,7 +9173,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_storess128_mask:
case X86::BI__builtin_ia32_storesd128_mask: {
- return EmitX86MaskedStore(*this, Ops, 16);
+ return EmitX86MaskedStore(*this, Ops, 1);
}
case X86::BI__builtin_ia32_vpopcntb_128:
case X86::BI__builtin_ia32_vpopcntd_128:
@@ -8177,6 +9205,66 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_cvtmask2q512:
return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
+ case X86::BI__builtin_ia32_cvtb2mask128:
+ case X86::BI__builtin_ia32_cvtb2mask256:
+ case X86::BI__builtin_ia32_cvtb2mask512:
+ case X86::BI__builtin_ia32_cvtw2mask128:
+ case X86::BI__builtin_ia32_cvtw2mask256:
+ case X86::BI__builtin_ia32_cvtw2mask512:
+ case X86::BI__builtin_ia32_cvtd2mask128:
+ case X86::BI__builtin_ia32_cvtd2mask256:
+ case X86::BI__builtin_ia32_cvtd2mask512:
+ case X86::BI__builtin_ia32_cvtq2mask128:
+ case X86::BI__builtin_ia32_cvtq2mask256:
+ case X86::BI__builtin_ia32_cvtq2mask512:
+ return EmitX86ConvertToMask(*this, Ops[0]);
+
+ case X86::BI__builtin_ia32_vfmaddss3:
+ case X86::BI__builtin_ia32_vfmaddsd3:
+ case X86::BI__builtin_ia32_vfmaddss3_mask:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask:
+ return EmitScalarFMAExpr(*this, Ops, Ops[0]);
+ case X86::BI__builtin_ia32_vfmaddss:
+ case X86::BI__builtin_ia32_vfmaddsd:
+ return EmitScalarFMAExpr(*this, Ops,
+ Constant::getNullValue(Ops[0]->getType()));
+ case X86::BI__builtin_ia32_vfmaddss3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsd3_maskz:
+ return EmitScalarFMAExpr(*this, Ops, Ops[0], /*ZeroMask*/true);
+ case X86::BI__builtin_ia32_vfmaddss3_mask3:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask3:
+ return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2);
+ case X86::BI__builtin_ia32_vfmsubss3_mask3:
+ case X86::BI__builtin_ia32_vfmsubsd3_mask3:
+ return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2,
+ /*NegAcc*/true);
+ case X86::BI__builtin_ia32_vfmaddps:
+ case X86::BI__builtin_ia32_vfmaddpd:
+ case X86::BI__builtin_ia32_vfmaddps256:
+ case X86::BI__builtin_ia32_vfmaddpd256:
+ case X86::BI__builtin_ia32_vfmaddps512_mask:
+ case X86::BI__builtin_ia32_vfmaddps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/false);
+ case X86::BI__builtin_ia32_vfmaddsubps:
+ case X86::BI__builtin_ia32_vfmaddsubpd:
+ case X86::BI__builtin_ia32_vfmaddsubps256:
+ case X86::BI__builtin_ia32_vfmaddsubpd256:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+ return EmitX86FMAExpr(*this, Ops, BuiltinID, /*IsAddSub*/true);
+
case X86::BI__builtin_ia32_movdqa32store128_mask:
case X86::BI__builtin_ia32_movdqa64store128_mask:
case X86::BI__builtin_ia32_storeaps128_mask:
@@ -8215,7 +9303,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_loadss128_mask:
case X86::BI__builtin_ia32_loadsd128_mask:
- return EmitX86MaskedLoad(*this, Ops, 16);
+ return EmitX86MaskedLoad(*this, Ops, 1);
case X86::BI__builtin_ia32_loadaps128_mask:
case X86::BI__builtin_ia32_loadaps256_mask:
@@ -8234,11 +9322,45 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return EmitX86MaskedLoad(*this, Ops, Align);
}
- case X86::BI__builtin_ia32_vbroadcastf128_pd256:
- case X86::BI__builtin_ia32_vbroadcastf128_ps256: {
- llvm::Type *DstTy = ConvertType(E->getType());
- return EmitX86SubVectorBroadcast(*this, Ops, DstTy, 128, 1);
- }
+ case X86::BI__builtin_ia32_expandloaddf128_mask:
+ case X86::BI__builtin_ia32_expandloaddf256_mask:
+ case X86::BI__builtin_ia32_expandloaddf512_mask:
+ case X86::BI__builtin_ia32_expandloadsf128_mask:
+ case X86::BI__builtin_ia32_expandloadsf256_mask:
+ case X86::BI__builtin_ia32_expandloadsf512_mask:
+ case X86::BI__builtin_ia32_expandloaddi128_mask:
+ case X86::BI__builtin_ia32_expandloaddi256_mask:
+ case X86::BI__builtin_ia32_expandloaddi512_mask:
+ case X86::BI__builtin_ia32_expandloadsi128_mask:
+ case X86::BI__builtin_ia32_expandloadsi256_mask:
+ case X86::BI__builtin_ia32_expandloadsi512_mask:
+ case X86::BI__builtin_ia32_expandloadhi128_mask:
+ case X86::BI__builtin_ia32_expandloadhi256_mask:
+ case X86::BI__builtin_ia32_expandloadhi512_mask:
+ case X86::BI__builtin_ia32_expandloadqi128_mask:
+ case X86::BI__builtin_ia32_expandloadqi256_mask:
+ case X86::BI__builtin_ia32_expandloadqi512_mask:
+ return EmitX86ExpandLoad(*this, Ops);
+
+ case X86::BI__builtin_ia32_compressstoredf128_mask:
+ case X86::BI__builtin_ia32_compressstoredf256_mask:
+ case X86::BI__builtin_ia32_compressstoredf512_mask:
+ case X86::BI__builtin_ia32_compressstoresf128_mask:
+ case X86::BI__builtin_ia32_compressstoresf256_mask:
+ case X86::BI__builtin_ia32_compressstoresf512_mask:
+ case X86::BI__builtin_ia32_compressstoredi128_mask:
+ case X86::BI__builtin_ia32_compressstoredi256_mask:
+ case X86::BI__builtin_ia32_compressstoredi512_mask:
+ case X86::BI__builtin_ia32_compressstoresi128_mask:
+ case X86::BI__builtin_ia32_compressstoresi256_mask:
+ case X86::BI__builtin_ia32_compressstoresi512_mask:
+ case X86::BI__builtin_ia32_compressstorehi128_mask:
+ case X86::BI__builtin_ia32_compressstorehi256_mask:
+ case X86::BI__builtin_ia32_compressstorehi512_mask:
+ case X86::BI__builtin_ia32_compressstoreqi128_mask:
+ case X86::BI__builtin_ia32_compressstoreqi256_mask:
+ case X86::BI__builtin_ia32_compressstoreqi512_mask:
+ return EmitX86CompressStore(*this, Ops);
case X86::BI__builtin_ia32_storehps:
case X86::BI__builtin_ia32_storelps: {
@@ -8250,17 +9372,275 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// extract (0, 1)
unsigned Index = BuiltinID == X86::BI__builtin_ia32_storelps ? 0 : 1;
- llvm::Value *Idx = llvm::ConstantInt::get(SizeTy, Index);
- Ops[1] = Builder.CreateExtractElement(Ops[1], Idx, "extract");
+ Ops[1] = Builder.CreateExtractElement(Ops[1], Index, "extract");
// cast pointer to i64 & store
Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
}
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask: {
+ llvm::Type *DstTy = ConvertType(E->getType());
+ unsigned NumElts = DstTy->getVectorNumElements();
+ unsigned SrcNumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned SubVectors = SrcNumElts / NumElts;
+ unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
+ assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
+ Index &= SubVectors - 1; // Remove any extra bits.
+ Index *= NumElts;
+
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i + Index;
+
+ Value *Res = Builder.CreateShuffleVector(Ops[0],
+ UndefValue::get(Ops[0]->getType()),
+ makeArrayRef(Indices, NumElts),
+ "extract");
+
+ if (Ops.size() == 4)
+ Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
+
+ return Res;
+ }
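A minimal standalone sketch of the index arithmetic used by the extract cases above, assuming the extractf64x4 form (an 8-element source, a 4-element destination) and an immediate of 1; the small driver below is illustrative only and not part of the patch.

    // Sketch: reproduces the shuffle-index computation for a subvector
    // extract; prints 4 5 6 7, i.e. the upper half of the 8-element source.
    #include <cstdio>
    int main() {
      unsigned NumElts = 4, SrcNumElts = 8;       // dst <4 x double>, src <8 x double>
      unsigned SubVectors = SrcNumElts / NumElts; // 2 subvectors of 4 elements
      unsigned Index = 1u & (SubVectors - 1);     // drop extra immediate bits
      Index *= NumElts;                           // element offset of the subvector
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%u ", i + Index);
      return 0;
    }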
+ case X86::BI__builtin_ia32_vinsertf128_pd256:
+ case X86::BI__builtin_ia32_vinsertf128_ps256:
+ case X86::BI__builtin_ia32_vinsertf128_si256:
+ case X86::BI__builtin_ia32_insert128i256:
+ case X86::BI__builtin_ia32_insertf64x4:
+ case X86::BI__builtin_ia32_insertf32x4:
+ case X86::BI__builtin_ia32_inserti64x4:
+ case X86::BI__builtin_ia32_inserti32x4:
+ case X86::BI__builtin_ia32_insertf32x8:
+ case X86::BI__builtin_ia32_inserti32x8:
+ case X86::BI__builtin_ia32_insertf32x4_256:
+ case X86::BI__builtin_ia32_inserti32x4_256:
+ case X86::BI__builtin_ia32_insertf64x2_256:
+ case X86::BI__builtin_ia32_inserti64x2_256:
+ case X86::BI__builtin_ia32_insertf64x2_512:
+ case X86::BI__builtin_ia32_inserti64x2_512: {
+ unsigned DstNumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned SrcNumElts = Ops[1]->getType()->getVectorNumElements();
+ unsigned SubVectors = DstNumElts / SrcNumElts;
+ unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
+ assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
+ Index &= SubVectors - 1; // Remove any extra bits.
+ Index *= SrcNumElts;
+
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != DstNumElts; ++i)
+ Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
+
+ Value *Op1 = Builder.CreateShuffleVector(Ops[1],
+ UndefValue::get(Ops[1]->getType()),
+ makeArrayRef(Indices, DstNumElts),
+ "widen");
+
+ for (unsigned i = 0; i != DstNumElts; ++i) {
+ if (i >= Index && i < (Index + SrcNumElts))
+ Indices[i] = (i - Index) + DstNumElts;
+ else
+ Indices[i] = i;
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], Op1,
+ makeArrayRef(Indices, DstNumElts),
+ "insert");
+ }
+ case X86::BI__builtin_ia32_pmovqd512_mask:
+ case X86::BI__builtin_ia32_pmovwb512_mask: {
+ Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
+ return EmitX86Select(*this, Ops[2], Res, Ops[1]);
+ }
+ case X86::BI__builtin_ia32_pmovdb512_mask:
+ case X86::BI__builtin_ia32_pmovdw512_mask:
+ case X86::BI__builtin_ia32_pmovqw512_mask: {
+ if (const auto *C = dyn_cast<Constant>(Ops[2]))
+ if (C->isAllOnesValue())
+ return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
+
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ default: llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_pmovdb512_mask:
+ IID = Intrinsic::x86_avx512_mask_pmov_db_512;
+ break;
+ case X86::BI__builtin_ia32_pmovdw512_mask:
+ IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
+ break;
+ case X86::BI__builtin_ia32_pmovqw512_mask:
+ IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
+ break;
+ }
+
+ Function *Intr = CGM.getIntrinsic(IID);
+ return Builder.CreateCall(Intr, Ops);
+ }
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_blendpd:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_pblendd256: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+ uint32_t Indices[16];
+ // If there are more than 8 elements, the immediate is used twice so make
+ // sure we handle that.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
+
+ return Builder.CreateShuffleVector(Ops[0], Ops[1],
+ makeArrayRef(Indices, NumElts),
+ "blend");
+ }
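A small sketch of how the 8-bit blend immediate is reused when a vector has more than 8 elements, assuming the pblendw256 case (16 elements) and an illustrative immediate of 0x0f; not part of the patch.

    // Sketch: element i takes operand 1 when bit (i % 8) of the immediate is
    // set, so lanes 8..15 reuse the same 8 immediate bits as lanes 0..7.
    #include <cstdio>
    int main() {
      unsigned NumElts = 16, Imm = 0x0f;
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%u ", ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i);
      return 0;  // prints 16 17 18 19 4 5 6 7 24 25 26 27 12 13 14 15
    }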
+ case X86::BI__builtin_ia32_pshuflw:
+ case X86::BI__builtin_ia32_pshuflw256:
+ case X86::BI__builtin_ia32_pshuflw512: {
+ uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+
+    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.

+ Imm = (Imm & 0xff) * 0x01010101;
+
+ uint32_t Indices[32];
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ for (unsigned i = 0; i != 4; ++i) {
+ Indices[l + i] = l + (Imm & 3);
+ Imm >>= 2;
+ }
+ for (unsigned i = 4; i != 8; ++i)
+ Indices[l + i] = l + i;
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
+ makeArrayRef(Indices, NumElts),
+ "pshuflw");
+ }
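A short sketch of the immediate-splat trick used above, assuming the two-lane pshuflw256 case and an illustrative immediate of 0x1b (reverse the low four words of each lane); not part of the patch.

    // Sketch: after Imm = (Imm & 0xff) * 0x01010101 the same 8 immediate bits
    // repeat in every byte, so the loop can keep shifting Imm right by two
    // across several 128-bit lanes without reloading it.
    #include <cstdio>
    int main() {
      unsigned NumElts = 16, Indices[32];
      unsigned Imm = (0x1b & 0xff) * 0x01010101;
      for (unsigned l = 0; l != NumElts; l += 8) {
        for (unsigned i = 0; i != 4; ++i) { Indices[l + i] = l + (Imm & 3); Imm >>= 2; }
        for (unsigned i = 4; i != 8; ++i) Indices[l + i] = l + i;
      }
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%u ", Indices[i]);
      return 0;  // prints 3 2 1 0 4 5 6 7 11 10 9 8 12 13 14 15
    }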
+ case X86::BI__builtin_ia32_pshufhw:
+ case X86::BI__builtin_ia32_pshufhw256:
+ case X86::BI__builtin_ia32_pshufhw512: {
+ uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+
+    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
+ Imm = (Imm & 0xff) * 0x01010101;
+
+ uint32_t Indices[32];
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ for (unsigned i = 0; i != 4; ++i)
+ Indices[l + i] = l + i;
+ for (unsigned i = 4; i != 8; ++i) {
+ Indices[l + i] = l + 4 + (Imm & 3);
+ Imm >>= 2;
+ }
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
+ makeArrayRef(Indices, NumElts),
+ "pshufhw");
+ }
+ case X86::BI__builtin_ia32_pshufd:
+ case X86::BI__builtin_ia32_pshufd256:
+ case X86::BI__builtin_ia32_pshufd512:
+ case X86::BI__builtin_ia32_vpermilpd:
+ case X86::BI__builtin_ia32_vpermilps:
+ case X86::BI__builtin_ia32_vpermilpd256:
+ case X86::BI__builtin_ia32_vpermilps256:
+ case X86::BI__builtin_ia32_vpermilpd512:
+ case X86::BI__builtin_ia32_vpermilps512: {
+ uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
+ Imm = (Imm & 0xff) * 0x01010101;
+
+ uint32_t Indices[16];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ Indices[i + l] = (Imm % NumLaneElts) + l;
+ Imm /= NumLaneElts;
+ }
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
+ makeArrayRef(Indices, NumElts),
+ "permil");
+ }
+ case X86::BI__builtin_ia32_shufpd:
+ case X86::BI__builtin_ia32_shufpd256:
+ case X86::BI__builtin_ia32_shufpd512:
+ case X86::BI__builtin_ia32_shufps:
+ case X86::BI__builtin_ia32_shufps256:
+ case X86::BI__builtin_ia32_shufps512: {
+ uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
+ Imm = (Imm & 0xff) * 0x01010101;
+
+ uint32_t Indices[16];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Index = Imm % NumLaneElts;
+ Imm /= NumLaneElts;
+ if (i >= (NumLaneElts / 2))
+ Index += NumElts;
+ Indices[l + i] = l + Index;
+ }
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], Ops[1],
+ makeArrayRef(Indices, NumElts),
+ "shufp");
+ }
+ case X86::BI__builtin_ia32_permdi256:
+ case X86::BI__builtin_ia32_permdf256:
+ case X86::BI__builtin_ia32_permdi512:
+ case X86::BI__builtin_ia32_permdf512: {
+ unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+
+ // These intrinsics operate on 256-bit lanes of four 64-bit elements.
+ uint32_t Indices[8];
+ for (unsigned l = 0; l != NumElts; l += 4)
+ for (unsigned i = 0; i != 4; ++i)
+ Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
+
+ return Builder.CreateShuffleVector(Ops[0], UndefValue::get(Ty),
+ makeArrayRef(Indices, NumElts),
+ "perm");
+ }
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
- case X86::BI__builtin_ia32_palignr512_mask: {
- unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+ case X86::BI__builtin_ia32_palignr512: {
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
assert(NumElts % 16 == 0);
@@ -8289,15 +9669,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
}
- Value *Align = Builder.CreateShuffleVector(Ops[1], Ops[0],
- makeArrayRef(Indices, NumElts),
- "palignr");
+ return Builder.CreateShuffleVector(Ops[1], Ops[0],
+ makeArrayRef(Indices, NumElts),
+ "palignr");
+ }
+ case X86::BI__builtin_ia32_alignd128:
+ case X86::BI__builtin_ia32_alignd256:
+ case X86::BI__builtin_ia32_alignd512:
+ case X86::BI__builtin_ia32_alignq128:
+ case X86::BI__builtin_ia32_alignq256:
+ case X86::BI__builtin_ia32_alignq512: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
+
+ // Mask the shift amount to width of two vectors.
+ ShiftVal &= (2 * NumElts) - 1;
- // If this isn't a masked builtin, just return the align operation.
- if (Ops.size() == 3)
- return Align;
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i + ShiftVal;
- return EmitX86Select(*this, Ops[4], Align, Ops[3]);
+ return Builder.CreateShuffleVector(Ops[1], Ops[0],
+ makeArrayRef(Indices, NumElts),
+ "valign");
+ }
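A brief sketch of the valign lowering above, assuming the alignd128 case (4 elements per source) and a shift of 2; in the two-source shuffle, indices 0..NumElts-1 select from Ops[1] and NumElts..2*NumElts-1 select from Ops[0]. Illustrative only, not part of the patch.

    // Sketch: prints 2 3 4 5, i.e. the top two elements of Ops[1] followed by
    // the low two elements of Ops[0].
    #include <cstdio>
    int main() {
      unsigned NumElts = 4, ShiftVal = 2 & 0xff;
      ShiftVal &= (2 * NumElts) - 1;              // width of two vectors
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%u ", i + ShiftVal);
      return 0;
    }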
+ case X86::BI__builtin_ia32_shuf_f32x4_256:
+ case X86::BI__builtin_ia32_shuf_f64x2_256:
+ case X86::BI__builtin_ia32_shuf_i32x4_256:
+ case X86::BI__builtin_ia32_shuf_i64x2_256:
+ case X86::BI__builtin_ia32_shuf_f32x4:
+ case X86::BI__builtin_ia32_shuf_f64x2:
+ case X86::BI__builtin_ia32_shuf_i32x4:
+ case X86::BI__builtin_ia32_shuf_i64x2: {
+ unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+ llvm::Type *Ty = Ops[0]->getType();
+ unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ uint32_t Indices[16];
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ unsigned Index = (Imm % NumLanes) * NumLaneElts;
+ Imm /= NumLanes; // Discard the bits we just used.
+ if (l >= (NumElts / 2))
+ Index += NumElts; // Switch to other source.
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ Indices[l + i] = Index + i;
+ }
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], Ops[1],
+ makeArrayRef(Indices, NumElts),
+ "shuf");
}
case X86::BI__builtin_ia32_vperm2f128_pd256:
@@ -8339,6 +9762,66 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
"vperm");
}
+ case X86::BI__builtin_ia32_pslldqi128_byteshift:
+ case X86::BI__builtin_ia32_pslldqi256_byteshift:
+ case X86::BI__builtin_ia32_pslldqi512_byteshift: {
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
+ llvm::Type *ResultType = Ops[0]->getType();
+ // Builtin type is vXi64 so multiply by 8 to get bytes.
+ unsigned NumElts = ResultType->getVectorNumElements() * 8;
+
+ // If pslldq is shifting the vector more than 15 bytes, emit zero.
+ if (ShiftVal >= 16)
+ return llvm::Constant::getNullValue(ResultType);
+
+ uint32_t Indices[64];
+    // 256/512-bit pslldq operates on 128-bit lanes, so we need to handle that.
+ for (unsigned l = 0; l != NumElts; l += 16) {
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = NumElts + i - ShiftVal;
+ if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
+ Indices[l + i] = Idx + l;
+ }
+ }
+
+ llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
+ Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
+ Value *Zero = llvm::Constant::getNullValue(VecTy);
+ Value *SV = Builder.CreateShuffleVector(Zero, Cast,
+ makeArrayRef(Indices, NumElts),
+ "pslldq");
+ return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
+ }
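A sketch of the per-lane index computation for the byte-shift above, assuming the 128-bit pslldq case and a shift of 3 bytes; not part of the patch.

    // Sketch: indices below NumElts pick bytes from the all-zero first shuffle
    // operand, indices >= NumElts pick bytes from the source, so the low three
    // bytes become zero and the rest is the source shifted up by three.
    #include <cstdio>
    int main() {
      unsigned NumElts = 16, ShiftVal = 3;
      for (unsigned l = 0; l != NumElts; l += 16)
        for (unsigned i = 0; i != 16; ++i) {
          unsigned Idx = NumElts + i - ShiftVal;
          if (Idx < NumElts) Idx -= NumElts - 16; // stay within this lane
          std::printf("%u ", Idx + l);            // prints 13 14 15 then 16 .. 28
        }
      return 0;
    }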
+ case X86::BI__builtin_ia32_psrldqi128_byteshift:
+ case X86::BI__builtin_ia32_psrldqi256_byteshift:
+ case X86::BI__builtin_ia32_psrldqi512_byteshift: {
+ unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
+ llvm::Type *ResultType = Ops[0]->getType();
+ // Builtin type is vXi64 so multiply by 8 to get bytes.
+ unsigned NumElts = ResultType->getVectorNumElements() * 8;
+
+ // If psrldq is shifting the vector more than 15 bytes, emit zero.
+ if (ShiftVal >= 16)
+ return llvm::Constant::getNullValue(ResultType);
+
+ uint32_t Indices[64];
+    // 256/512-bit psrldq operates on 128-bit lanes, so we need to handle that.
+ for (unsigned l = 0; l != NumElts; l += 16) {
+ for (unsigned i = 0; i != 16; ++i) {
+ unsigned Idx = i + ShiftVal;
+ if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
+ Indices[l + i] = Idx + l;
+ }
+ }
+
+ llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, NumElts);
+ Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
+ Value *Zero = llvm::Constant::getNullValue(VecTy);
+ Value *SV = Builder.CreateShuffleVector(Cast, Zero,
+ makeArrayRef(Indices, NumElts),
+ "psrldq");
+ return Builder.CreateBitCast(SV, ResultType, "cast");
+ }
case X86::BI__builtin_ia32_movnti:
case X86::BI__builtin_ia32_movnti64:
case X86::BI__builtin_ia32_movntsd:
@@ -8384,6 +9867,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_selectpd_256:
case X86::BI__builtin_ia32_selectpd_512:
return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
+ case X86::BI__builtin_ia32_selectss_128:
+ case X86::BI__builtin_ia32_selectsd_128: {
+ Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+ Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+ A = EmitX86ScalarSelect(*this, Ops[0], A, B);
+ return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
+ }
case X86::BI__builtin_ia32_cmpb128_mask:
case X86::BI__builtin_ia32_cmpb256_mask:
case X86::BI__builtin_ia32_cmpb512_mask:
@@ -8415,6 +9905,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return EmitX86MaskedCompare(*this, CC, false, Ops);
}
+ case X86::BI__builtin_ia32_kortestchi:
+ case X86::BI__builtin_ia32_kortestzhi: {
+ Value *Or = EmitX86MaskLogic(*this, Instruction::Or, 16, Ops);
+ Value *C;
+ if (BuiltinID == X86::BI__builtin_ia32_kortestchi)
+ C = llvm::Constant::getAllOnesValue(Builder.getInt16Ty());
+ else
+ C = llvm::Constant::getNullValue(Builder.getInt16Ty());
+ Value *Cmp = Builder.CreateICmpEQ(Or, C);
+ return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
+ }
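A small sketch of the mask-level meaning of the two kortest forms handled above, assuming 16-bit AVX-512 mask values; the helper names are illustrative only.

    // Sketch: kortestc is 1 iff (a | b) is all ones, kortestz is 1 iff
    // (a | b) is zero; the code above emits exactly these compares.
    #include <cstdint>
    static int kortestc_sketch(uint16_t a, uint16_t b) { return (uint16_t)(a | b) == 0xffff; }
    static int kortestz_sketch(uint16_t a, uint16_t b) { return (uint16_t)(a | b) == 0; }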
+
case X86::BI__builtin_ia32_kandhi:
return EmitX86MaskLogic(*this, Instruction::And, 16, Ops);
case X86::BI__builtin_ia32_kandnhi:
@@ -8431,85 +9933,176 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Builder.getInt16Ty());
}
- case X86::BI__builtin_ia32_vplzcntd_128_mask:
- case X86::BI__builtin_ia32_vplzcntd_256_mask:
- case X86::BI__builtin_ia32_vplzcntd_512_mask:
- case X86::BI__builtin_ia32_vplzcntq_128_mask:
- case X86::BI__builtin_ia32_vplzcntq_256_mask:
- case X86::BI__builtin_ia32_vplzcntq_512_mask: {
+ case X86::BI__builtin_ia32_kunpckdi:
+ case X86::BI__builtin_ia32_kunpcksi:
+ case X86::BI__builtin_ia32_kunpckhi: {
+ unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits();
+ Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
+ Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
+ uint32_t Indices[64];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+
+ // First extract half of each vector. This gives better codegen than
+ // doing it in a single shuffle.
+ LHS = Builder.CreateShuffleVector(LHS, LHS,
+ makeArrayRef(Indices, NumElts / 2));
+ RHS = Builder.CreateShuffleVector(RHS, RHS,
+ makeArrayRef(Indices, NumElts / 2));
+ // Concat the vectors.
+ // NOTE: Operands are swapped to match the intrinsic definition.
+ Value *Res = Builder.CreateShuffleVector(RHS, LHS,
+ makeArrayRef(Indices, NumElts));
+ return Builder.CreateBitCast(Res, Ops[0]->getType());
+ }
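A sketch of the integer-mask semantics that the two half-extracts plus the swapped concatenation implement, assuming the 16-bit kunpckhi variant; the helper name is illustrative only.

    // Sketch: the low 8 result bits come from the second operand and the high
    // 8 bits from the first, matching the operand swap noted above.
    #include <cstdint>
    static uint16_t kunpck_sketch(uint16_t a, uint16_t b) {
      return (uint16_t)(((a & 0xffu) << 8) | (b & 0xffu));
    }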
+
+ case X86::BI__builtin_ia32_vplzcntd_128:
+ case X86::BI__builtin_ia32_vplzcntd_256:
+ case X86::BI__builtin_ia32_vplzcntd_512:
+ case X86::BI__builtin_ia32_vplzcntq_128:
+ case X86::BI__builtin_ia32_vplzcntq_256:
+ case X86::BI__builtin_ia32_vplzcntq_512: {
Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
- return EmitX86Select(*this, Ops[2],
- Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)}),
- Ops[1]);
+ return Builder.CreateCall(F, {Ops[0],Builder.getInt1(false)});
+ }
+ case X86::BI__builtin_ia32_sqrtss:
+ case X86::BI__builtin_ia32_sqrtsd: {
+ Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+ Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
+ A = Builder.CreateCall(F, {A});
+ return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
+ }
+ case X86::BI__builtin_ia32_sqrtsd_round_mask:
+ case X86::BI__builtin_ia32_sqrtss_round_mask: {
+ unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ?
+ Intrinsic::x86_avx512_mask_sqrt_sd :
+ Intrinsic::x86_avx512_mask_sqrt_ss;
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ }
+ Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+ Function *F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
+ A = Builder.CreateCall(F, A);
+ Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+ A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
+ return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
+ }
+ case X86::BI__builtin_ia32_sqrtpd256:
+ case X86::BI__builtin_ia32_sqrtpd:
+ case X86::BI__builtin_ia32_sqrtps256:
+ case X86::BI__builtin_ia32_sqrtps:
+ case X86::BI__builtin_ia32_sqrtps512:
+ case X86::BI__builtin_ia32_sqrtpd512: {
+ if (Ops.size() == 2) {
+ unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ?
+ Intrinsic::x86_avx512_sqrt_ps_512 :
+ Intrinsic::x86_avx512_sqrt_pd_512;
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ }
+ }
+ Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
+ return Builder.CreateCall(F, Ops[0]);
}
-
case X86::BI__builtin_ia32_pabsb128:
case X86::BI__builtin_ia32_pabsw128:
case X86::BI__builtin_ia32_pabsd128:
case X86::BI__builtin_ia32_pabsb256:
case X86::BI__builtin_ia32_pabsw256:
case X86::BI__builtin_ia32_pabsd256:
- case X86::BI__builtin_ia32_pabsq128_mask:
- case X86::BI__builtin_ia32_pabsq256_mask:
- case X86::BI__builtin_ia32_pabsb512_mask:
- case X86::BI__builtin_ia32_pabsw512_mask:
- case X86::BI__builtin_ia32_pabsd512_mask:
- case X86::BI__builtin_ia32_pabsq512_mask:
+ case X86::BI__builtin_ia32_pabsq128:
+ case X86::BI__builtin_ia32_pabsq256:
+ case X86::BI__builtin_ia32_pabsb512:
+ case X86::BI__builtin_ia32_pabsw512:
+ case X86::BI__builtin_ia32_pabsd512:
+ case X86::BI__builtin_ia32_pabsq512:
return EmitX86Abs(*this, Ops);
case X86::BI__builtin_ia32_pmaxsb128:
case X86::BI__builtin_ia32_pmaxsw128:
case X86::BI__builtin_ia32_pmaxsd128:
- case X86::BI__builtin_ia32_pmaxsq128_mask:
+ case X86::BI__builtin_ia32_pmaxsq128:
case X86::BI__builtin_ia32_pmaxsb256:
case X86::BI__builtin_ia32_pmaxsw256:
case X86::BI__builtin_ia32_pmaxsd256:
- case X86::BI__builtin_ia32_pmaxsq256_mask:
- case X86::BI__builtin_ia32_pmaxsb512_mask:
- case X86::BI__builtin_ia32_pmaxsw512_mask:
- case X86::BI__builtin_ia32_pmaxsd512_mask:
- case X86::BI__builtin_ia32_pmaxsq512_mask:
+ case X86::BI__builtin_ia32_pmaxsq256:
+ case X86::BI__builtin_ia32_pmaxsb512:
+ case X86::BI__builtin_ia32_pmaxsw512:
+ case X86::BI__builtin_ia32_pmaxsd512:
+ case X86::BI__builtin_ia32_pmaxsq512:
return EmitX86MinMax(*this, ICmpInst::ICMP_SGT, Ops);
case X86::BI__builtin_ia32_pmaxub128:
case X86::BI__builtin_ia32_pmaxuw128:
case X86::BI__builtin_ia32_pmaxud128:
- case X86::BI__builtin_ia32_pmaxuq128_mask:
+ case X86::BI__builtin_ia32_pmaxuq128:
case X86::BI__builtin_ia32_pmaxub256:
case X86::BI__builtin_ia32_pmaxuw256:
case X86::BI__builtin_ia32_pmaxud256:
- case X86::BI__builtin_ia32_pmaxuq256_mask:
- case X86::BI__builtin_ia32_pmaxub512_mask:
- case X86::BI__builtin_ia32_pmaxuw512_mask:
- case X86::BI__builtin_ia32_pmaxud512_mask:
- case X86::BI__builtin_ia32_pmaxuq512_mask:
+ case X86::BI__builtin_ia32_pmaxuq256:
+ case X86::BI__builtin_ia32_pmaxub512:
+ case X86::BI__builtin_ia32_pmaxuw512:
+ case X86::BI__builtin_ia32_pmaxud512:
+ case X86::BI__builtin_ia32_pmaxuq512:
return EmitX86MinMax(*this, ICmpInst::ICMP_UGT, Ops);
case X86::BI__builtin_ia32_pminsb128:
case X86::BI__builtin_ia32_pminsw128:
case X86::BI__builtin_ia32_pminsd128:
- case X86::BI__builtin_ia32_pminsq128_mask:
+ case X86::BI__builtin_ia32_pminsq128:
case X86::BI__builtin_ia32_pminsb256:
case X86::BI__builtin_ia32_pminsw256:
case X86::BI__builtin_ia32_pminsd256:
- case X86::BI__builtin_ia32_pminsq256_mask:
- case X86::BI__builtin_ia32_pminsb512_mask:
- case X86::BI__builtin_ia32_pminsw512_mask:
- case X86::BI__builtin_ia32_pminsd512_mask:
- case X86::BI__builtin_ia32_pminsq512_mask:
+ case X86::BI__builtin_ia32_pminsq256:
+ case X86::BI__builtin_ia32_pminsb512:
+ case X86::BI__builtin_ia32_pminsw512:
+ case X86::BI__builtin_ia32_pminsd512:
+ case X86::BI__builtin_ia32_pminsq512:
return EmitX86MinMax(*this, ICmpInst::ICMP_SLT, Ops);
case X86::BI__builtin_ia32_pminub128:
case X86::BI__builtin_ia32_pminuw128:
case X86::BI__builtin_ia32_pminud128:
- case X86::BI__builtin_ia32_pminuq128_mask:
+ case X86::BI__builtin_ia32_pminuq128:
case X86::BI__builtin_ia32_pminub256:
case X86::BI__builtin_ia32_pminuw256:
case X86::BI__builtin_ia32_pminud256:
- case X86::BI__builtin_ia32_pminuq256_mask:
- case X86::BI__builtin_ia32_pminub512_mask:
- case X86::BI__builtin_ia32_pminuw512_mask:
- case X86::BI__builtin_ia32_pminud512_mask:
- case X86::BI__builtin_ia32_pminuq512_mask:
+ case X86::BI__builtin_ia32_pminuq256:
+ case X86::BI__builtin_ia32_pminub512:
+ case X86::BI__builtin_ia32_pminuw512:
+ case X86::BI__builtin_ia32_pminud512:
+ case X86::BI__builtin_ia32_pminuq512:
return EmitX86MinMax(*this, ICmpInst::ICMP_ULT, Ops);
+ case X86::BI__builtin_ia32_pmuludq128:
+ case X86::BI__builtin_ia32_pmuludq256:
+ case X86::BI__builtin_ia32_pmuludq512:
+ return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
+
+ case X86::BI__builtin_ia32_pmuldq128:
+ case X86::BI__builtin_ia32_pmuldq256:
+ case X86::BI__builtin_ia32_pmuldq512:
+ return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
+
+ case X86::BI__builtin_ia32_pternlogd512_mask:
+ case X86::BI__builtin_ia32_pternlogq512_mask:
+ case X86::BI__builtin_ia32_pternlogd128_mask:
+ case X86::BI__builtin_ia32_pternlogd256_mask:
+ case X86::BI__builtin_ia32_pternlogq128_mask:
+ case X86::BI__builtin_ia32_pternlogq256_mask:
+ return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
+
+ case X86::BI__builtin_ia32_pternlogd512_maskz:
+ case X86::BI__builtin_ia32_pternlogq512_maskz:
+ case X86::BI__builtin_ia32_pternlogd128_maskz:
+ case X86::BI__builtin_ia32_pternlogd256_maskz:
+ case X86::BI__builtin_ia32_pternlogq128_maskz:
+ case X86::BI__builtin_ia32_pternlogq256_maskz:
+ return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
+
// 3DNow!
case X86::BI__builtin_ia32_pswapdsf:
case X86::BI__builtin_ia32_pswapdsi: {
@@ -8553,7 +10146,44 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateExtractValue(Call, 1);
}
- // SSE packed comparison intrinsics
+ case X86::BI__builtin_ia32_fpclassps128_mask:
+ case X86::BI__builtin_ia32_fpclassps256_mask:
+ case X86::BI__builtin_ia32_fpclassps512_mask:
+ case X86::BI__builtin_ia32_fpclasspd128_mask:
+ case X86::BI__builtin_ia32_fpclasspd256_mask:
+ case X86::BI__builtin_ia32_fpclasspd512_mask: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ Value *MaskIn = Ops[2];
+ Ops.erase(&Ops[2]);
+
+ Intrinsic::ID ID;
+ switch (BuiltinID) {
+ default: llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_fpclassps128_mask:
+ ID = Intrinsic::x86_avx512_fpclass_ps_128;
+ break;
+ case X86::BI__builtin_ia32_fpclassps256_mask:
+ ID = Intrinsic::x86_avx512_fpclass_ps_256;
+ break;
+ case X86::BI__builtin_ia32_fpclassps512_mask:
+ ID = Intrinsic::x86_avx512_fpclass_ps_512;
+ break;
+ case X86::BI__builtin_ia32_fpclasspd128_mask:
+ ID = Intrinsic::x86_avx512_fpclass_pd_128;
+ break;
+ case X86::BI__builtin_ia32_fpclasspd256_mask:
+ ID = Intrinsic::x86_avx512_fpclass_pd_256;
+ break;
+ case X86::BI__builtin_ia32_fpclasspd512_mask:
+ ID = Intrinsic::x86_avx512_fpclass_pd_512;
+ break;
+ }
+
+ Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
+ return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
+ }
+
+ // packed comparison intrinsics
case X86::BI__builtin_ia32_cmpeqps:
case X86::BI__builtin_ia32_cmpeqpd:
return getVectorFCmpIR(CmpInst::FCMP_OEQ);
@@ -8581,64 +10211,79 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_cmpps:
case X86::BI__builtin_ia32_cmpps256:
case X86::BI__builtin_ia32_cmppd:
- case X86::BI__builtin_ia32_cmppd256: {
- unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
- // If this one of the SSE immediates, we can use native IR.
- if (CC < 8) {
- FCmpInst::Predicate Pred;
- switch (CC) {
- case 0: Pred = FCmpInst::FCMP_OEQ; break;
- case 1: Pred = FCmpInst::FCMP_OLT; break;
- case 2: Pred = FCmpInst::FCMP_OLE; break;
- case 3: Pred = FCmpInst::FCMP_UNO; break;
- case 4: Pred = FCmpInst::FCMP_UNE; break;
- case 5: Pred = FCmpInst::FCMP_UGE; break;
- case 6: Pred = FCmpInst::FCMP_UGT; break;
- case 7: Pred = FCmpInst::FCMP_ORD; break;
- }
- return getVectorFCmpIR(Pred);
+ case X86::BI__builtin_ia32_cmppd256:
+ case X86::BI__builtin_ia32_cmpps128_mask:
+ case X86::BI__builtin_ia32_cmpps256_mask:
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmppd128_mask:
+ case X86::BI__builtin_ia32_cmppd256_mask:
+ case X86::BI__builtin_ia32_cmppd512_mask: {
+    // Lower vector comparisons to fcmp instructions, ignoring both the
+    // requested signalling behaviour and the requested rounding mode.
+    // This is only possible as long as FENV_ACCESS is not implemented.
+    // See also: https://reviews.llvm.org/D45616
+
+    // The third argument is the comparison condition, an integer in the
+    // range [0, 31].
+ unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
+
+    // Lower to an IR fcmp instruction, ignoring the requested signaling
+    // behaviour; e.g. both _CMP_GT_OS and _CMP_GT_OQ are translated to
+    // FCMP_OGT.
+ FCmpInst::Predicate Pred;
+ switch (CC) {
+ case 0x00: Pred = FCmpInst::FCMP_OEQ; break;
+ case 0x01: Pred = FCmpInst::FCMP_OLT; break;
+ case 0x02: Pred = FCmpInst::FCMP_OLE; break;
+ case 0x03: Pred = FCmpInst::FCMP_UNO; break;
+ case 0x04: Pred = FCmpInst::FCMP_UNE; break;
+ case 0x05: Pred = FCmpInst::FCMP_UGE; break;
+ case 0x06: Pred = FCmpInst::FCMP_UGT; break;
+ case 0x07: Pred = FCmpInst::FCMP_ORD; break;
+ case 0x08: Pred = FCmpInst::FCMP_UEQ; break;
+ case 0x09: Pred = FCmpInst::FCMP_ULT; break;
+ case 0x0a: Pred = FCmpInst::FCMP_ULE; break;
+ case 0x0b: Pred = FCmpInst::FCMP_FALSE; break;
+ case 0x0c: Pred = FCmpInst::FCMP_ONE; break;
+ case 0x0d: Pred = FCmpInst::FCMP_OGE; break;
+ case 0x0e: Pred = FCmpInst::FCMP_OGT; break;
+ case 0x0f: Pred = FCmpInst::FCMP_TRUE; break;
+ case 0x10: Pred = FCmpInst::FCMP_OEQ; break;
+ case 0x11: Pred = FCmpInst::FCMP_OLT; break;
+ case 0x12: Pred = FCmpInst::FCMP_OLE; break;
+ case 0x13: Pred = FCmpInst::FCMP_UNO; break;
+ case 0x14: Pred = FCmpInst::FCMP_UNE; break;
+ case 0x15: Pred = FCmpInst::FCMP_UGE; break;
+ case 0x16: Pred = FCmpInst::FCMP_UGT; break;
+ case 0x17: Pred = FCmpInst::FCMP_ORD; break;
+ case 0x18: Pred = FCmpInst::FCMP_UEQ; break;
+ case 0x19: Pred = FCmpInst::FCMP_ULT; break;
+ case 0x1a: Pred = FCmpInst::FCMP_ULE; break;
+ case 0x1b: Pred = FCmpInst::FCMP_FALSE; break;
+ case 0x1c: Pred = FCmpInst::FCMP_ONE; break;
+ case 0x1d: Pred = FCmpInst::FCMP_OGE; break;
+ case 0x1e: Pred = FCmpInst::FCMP_OGT; break;
+ case 0x1f: Pred = FCmpInst::FCMP_TRUE; break;
+ default: llvm_unreachable("Unhandled CC");
}
- // We can't handle 8-31 immediates with native IR, use the intrinsic.
- // Except for predicates that create constants.
- Intrinsic::ID ID;
+ // Builtins without the _mask suffix return a vector of integers
+ // of the same width as the input vectors
switch (BuiltinID) {
- default: llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_cmpps:
- ID = Intrinsic::x86_sse_cmp_ps;
- break;
- case X86::BI__builtin_ia32_cmpps256:
- // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector
- // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0...
- if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) {
- Value *Constant = (CC == 0xf || CC == 0x1f) ?
- llvm::Constant::getAllOnesValue(Builder.getInt32Ty()) :
- llvm::Constant::getNullValue(Builder.getInt32Ty());
- Value *Vec = Builder.CreateVectorSplat(
- Ops[0]->getType()->getVectorNumElements(), Constant);
- return Builder.CreateBitCast(Vec, Ops[0]->getType());
- }
- ID = Intrinsic::x86_avx_cmp_ps_256;
- break;
- case X86::BI__builtin_ia32_cmppd:
- ID = Intrinsic::x86_sse2_cmp_pd;
- break;
- case X86::BI__builtin_ia32_cmppd256:
- // _CMP_TRUE_UQ, _CMP_TRUE_US produce -1,-1... vector
- // on any input and _CMP_FALSE_OQ, _CMP_FALSE_OS produce 0, 0...
- if (CC == 0xf || CC == 0xb || CC == 0x1b || CC == 0x1f) {
- Value *Constant = (CC == 0xf || CC == 0x1f) ?
- llvm::Constant::getAllOnesValue(Builder.getInt64Ty()) :
- llvm::Constant::getNullValue(Builder.getInt64Ty());
- Value *Vec = Builder.CreateVectorSplat(
- Ops[0]->getType()->getVectorNumElements(), Constant);
- return Builder.CreateBitCast(Vec, Ops[0]->getType());
- }
- ID = Intrinsic::x86_avx_cmp_pd_256;
- break;
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmppd512_mask:
+ case X86::BI__builtin_ia32_cmpps128_mask:
+ case X86::BI__builtin_ia32_cmpps256_mask:
+ case X86::BI__builtin_ia32_cmppd128_mask:
+ case X86::BI__builtin_ia32_cmppd256_mask: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
+ return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
+ }
+ default:
+ return getVectorFCmpIR(Pred);
}
-
- return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
}
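A brief illustration of the comment above: the immediate is masked to five bits, and the signalling (OS/US) and quiet (OQ/UQ) encodings, which differ only in bit 4 of the immediate, map to the same fcmp predicate in the table; the helper below is illustrative only.

    // Sketch: e.g. _CMP_GT_OS (0x0e) and _CMP_GT_OQ (0x1e) both lower to an
    // ordered greater-than compare under the table above.
    #include <cassert>
    static unsigned canonical_cc_sketch(unsigned Imm) { return (Imm & 0x1f) & 0x0f; }
    static void check_sketch() {
      assert(canonical_cc_sketch(0x0e) == canonical_cc_sketch(0x1e));
    }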
// SSE scalar comparison intrinsics
@@ -8716,6 +10361,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
llvm::SyncScope::System);
}
+ case X86::BI__shiftleft128:
+ case X86::BI__shiftright128: {
+    // FIXME: Once fshl/fshr no longer add an unneeded "and" and "cmov", do this:
+ // llvm::Function *F = CGM.getIntrinsic(
+ // BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
+ // Int64Ty);
+ // Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
+ // return Builder.CreateCall(F, Ops);
+ llvm::Type *Int128Ty = Builder.getInt128Ty();
+ Value *Val = Builder.CreateOr(
+ Builder.CreateShl(Builder.CreateZExt(Ops[1], Int128Ty), 64),
+ Builder.CreateZExt(Ops[0], Int128Ty));
+ Value *Amt = Builder.CreateAnd(Builder.CreateZExt(Ops[2], Int128Ty),
+ llvm::ConstantInt::get(Int128Ty, 0x3f));
+ Value *Res;
+ if (BuiltinID == X86::BI__shiftleft128)
+ Res = Builder.CreateLShr(Builder.CreateShl(Val, Amt), 64);
+ else
+ Res = Builder.CreateLShr(Val, Amt);
+ return Builder.CreateTrunc(Res, Int64Ty);
+ }
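A sketch of the same 128-bit emulation in plain C++, assuming Clang/GCC's unsigned __int128 extension; it mirrors the IR built above and is not part of the patch.

    // Sketch: __shiftleft128 returns the high 64 bits of (hi:lo) << (n & 63),
    // __shiftright128 returns the low 64 bits of (hi:lo) >> (n & 63).
    static unsigned long long shiftleft128_sketch(unsigned long long lo,
                                                  unsigned long long hi,
                                                  unsigned char n) {
      unsigned __int128 val = ((unsigned __int128)hi << 64) | lo;
      return (unsigned long long)((val << (n & 63)) >> 64);
    }
    static unsigned long long shiftright128_sketch(unsigned long long lo,
                                                   unsigned long long hi,
                                                   unsigned char n) {
      unsigned __int128 val = ((unsigned __int128)hi << 64) | lo;
      return (unsigned long long)(val >> (n & 63));
    }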
case X86::BI_ReadWriteBarrier:
case X86::BI_ReadBarrier:
case X86::BI_WriteBarrier: {
@@ -9199,19 +10865,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2));
Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2));
- // Element zero comes from the first input vector and element one comes from
- // the second. The element indices within each vector are numbered in big
- // endian order so the shuffle mask must be adjusted for this on little
- // endian platforms (i.e. index is complemented and source vector reversed).
- unsigned ElemIdx0;
- unsigned ElemIdx1;
- if (getTarget().isLittleEndian()) {
- ElemIdx0 = (~Index & 1) + 2;
- ElemIdx1 = (~Index & 2) >> 1;
- } else { // BigEndian
- ElemIdx0 = (Index & 2) >> 1;
- ElemIdx1 = 2 + (Index & 1);
- }
+ // Account for endianness by treating this as just a shuffle. So we use the
+ // same indices for both LE and BE in order to produce expected results in
+ // both cases.
+ unsigned ElemIdx0 = (Index & 2) >> 1;
+ unsigned ElemIdx1 = 2 + (Index & 1);
Constant *ShuffleElts[2] = {ConstantInt::get(Int32Ty, ElemIdx0),
ConstantInt::get(Int32Ty, ElemIdx1)};
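A small sketch listing the two shuffle indices produced above for each xxpermdi immediate; element indices 0..1 select from the first input and 2..3 from the second, on both endiannesses. Illustrative only, not part of the patch.

    // Sketch: prints imm 0 -> {0, 2}, imm 1 -> {0, 3}, imm 2 -> {1, 2},
    // imm 3 -> {1, 3}.
    #include <cstdio>
    int main() {
      for (unsigned Index = 0; Index != 4; ++Index)
        std::printf("imm %u -> {%u, %u}\n", Index,
                    (Index & 2) >> 1, 2 + (Index & 1));
      return 0;
    }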
@@ -9402,6 +11060,49 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CI->setConvergent();
return CI;
}
+ case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+ case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
+ llvm::SmallVector<llvm::Value *, 5> Args;
+ for (unsigned I = 0; I != 5; ++I)
+ Args.push_back(EmitScalarExpr(E->getArg(I)));
+ const llvm::Type *PtrTy = Args[0]->getType();
+ // check pointer parameter
+ if (!PtrTy->isPointerTy() ||
+ E->getArg(0)
+ ->getType()
+ ->getPointeeType()
+ .getQualifiers()
+ .getAddressSpace() != LangAS::opencl_local ||
+ !PtrTy->getPointerElementType()->isFloatTy()) {
+ CGM.Error(E->getArg(0)->getLocStart(),
+ "parameter should have type \"local float*\"");
+ return nullptr;
+ }
+ // check float parameter
+ if (!Args[1]->getType()->isFloatTy()) {
+ CGM.Error(E->getArg(1)->getLocStart(),
+ "parameter should have type \"float\"");
+ return nullptr;
+ }
+
+ Intrinsic::ID ID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+ ID = Intrinsic::amdgcn_ds_fadd;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+ ID = Intrinsic::amdgcn_ds_fmin;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+ ID = Intrinsic::amdgcn_ds_fmax;
+ break;
+ default:
+ llvm_unreachable("Unknown BuiltinID");
+ }
+ Value *F = CGM.getIntrinsic(ID);
+ return Builder.CreateCall(F, Args);
+ }
// amdgcn workitem
case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
@@ -10032,7 +11733,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__hmma_m16n16k16_ld_a:
case NVPTX::BI__hmma_m16n16k16_ld_b:
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
- case NVPTX::BI__hmma_m16n16k16_ld_c_f32: {
+ case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
+ case NVPTX::BI__hmma_m32n8k16_ld_a:
+ case NVPTX::BI__hmma_m32n8k16_ld_b:
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+ case NVPTX::BI__hmma_m8n32k16_ld_a:
+ case NVPTX::BI__hmma_m8n32k16_ld_b:
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Value *Src = EmitScalarExpr(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -10044,31 +11753,70 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
unsigned NumResults;
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_ld_a:
- IID = isColMajor ? Intrinsic::nvvm_wmma_load_a_f16_col_stride
- : Intrinsic::nvvm_wmma_load_a_f16_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride;
NumResults = 8;
break;
case NVPTX::BI__hmma_m16n16k16_ld_b:
- IID = isColMajor ? Intrinsic::nvvm_wmma_load_b_f16_col_stride
- : Intrinsic::nvvm_wmma_load_b_f16_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride;
NumResults = 8;
break;
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f16_col_stride
- : Intrinsic::nvvm_wmma_load_c_f16_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride;
NumResults = 4;
break;
case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f32_col_stride
- : Intrinsic::nvvm_wmma_load_c_f32_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_a:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_b:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_a:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_b:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
+ NumResults = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
NumResults = 8;
break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
Value *Result =
- Builder.CreateCall(CGM.getIntrinsic(IID),
- {Builder.CreatePointerCast(Src, VoidPtrTy), Ldm});
+ Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
// Save returned values.
for (unsigned i = 0; i < NumResults; ++i) {
@@ -10082,7 +11830,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
}
case NVPTX::BI__hmma_m16n16k16_st_c_f16:
- case NVPTX::BI__hmma_m16n16k16_st_c_f32: {
+ case NVPTX::BI__hmma_m16n16k16_st_c_f32:
+ case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+ case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+ case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
Value *Dst = EmitScalarExpr(E->getArg(0));
Address Src = EmitPointerWithAlignment(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
@@ -10096,21 +11848,38 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
// for some reason nvcc builtins use _c_.
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_st_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f16_col_stride
- : Intrinsic::nvvm_wmma_store_d_f16_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride;
NumResults = 4;
break;
case NVPTX::BI__hmma_m16n16k16_st_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_store_d_f32_col_stride
- : Intrinsic::nvvm_wmma_store_d_f32_row_stride;
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
+ : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
+ : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
+ NumResults = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+ IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
+ : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
break;
default:
llvm_unreachable("Unexpected builtin ID.");
}
- Function *Intrinsic = CGM.getIntrinsic(IID);
+ Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
- SmallVector<Value *, 10> Values;
- Values.push_back(Builder.CreatePointerCast(Dst, VoidPtrTy));
+ SmallVector<Value *, 10> Values = {Dst};
for (unsigned i = 0; i < NumResults; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
@@ -10122,12 +11891,20 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
return Result;
}
- // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf)
- // --> Intrinsic::nvvm_wmma_mma_sync<layout A,B><DType><CType><Satf>
+ // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
+ // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
- case NVPTX::BI__hmma_m16n16k16_mma_f16f32: {
+ case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Address SrcA = EmitPointerWithAlignment(E->getArg(1));
Address SrcB = EmitPointerWithAlignment(E->getArg(2));
@@ -10144,15 +11921,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
bool Satf = SatfArg.getSExtValue();
// clang-format off
-#define MMA_VARIANTS(type) {{ \
- Intrinsic::nvvm_wmma_mma_sync_row_row_##type, \
- Intrinsic::nvvm_wmma_mma_sync_row_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_mma_sync_row_col_##type, \
- Intrinsic::nvvm_wmma_mma_sync_row_col_##type##_satfinite, \
- Intrinsic::nvvm_wmma_mma_sync_col_row_##type, \
- Intrinsic::nvvm_wmma_mma_sync_col_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_mma_sync_col_col_##type, \
- Intrinsic::nvvm_wmma_mma_sync_col_col_##type##_satfinite \
+#define MMA_VARIANTS(geom, type) {{ \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \
}}
// clang-format on
@@ -10166,22 +11943,62 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
unsigned NumEltsD;
switch (BuiltinID) {
case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(f16_f16));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
NumEltsC = 4;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(f32_f16));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
NumEltsC = 4;
NumEltsD = 8;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(f16_f32));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
NumEltsC = 8;
NumEltsD = 4;
break;
case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(f32_f32));
+ IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
+ NumEltsC = 8;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
+ NumEltsC = 4;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
+ NumEltsC = 4;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
+ NumEltsC = 8;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
+ NumEltsC = 8;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
+ NumEltsC = 4;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
+ NumEltsC = 4;
+ NumEltsD = 8;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
+ NumEltsC = 8;
+ NumEltsD = 4;
+ break;
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+ IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
NumEltsC = 8;
NumEltsD = 8;
break;
@@ -10235,6 +12052,36 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
switch (BuiltinID) {
+ case WebAssembly::BI__builtin_wasm_memory_size: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ Value *I = EmitScalarExpr(E->getArg(0));
+ Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
+ return Builder.CreateCall(Callee, I);
+ }
+ case WebAssembly::BI__builtin_wasm_memory_grow: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ Value *Args[] = {
+ EmitScalarExpr(E->getArg(0)),
+ EmitScalarExpr(E->getArg(1))
+ };
+ Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
+ return Builder.CreateCall(Callee, Args);
+ }
+ case WebAssembly::BI__builtin_wasm_mem_size: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ Value *I = EmitScalarExpr(E->getArg(0));
+ Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_size, ResultType);
+ return Builder.CreateCall(Callee, I);
+ }
+ case WebAssembly::BI__builtin_wasm_mem_grow: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ Value *Args[] = {
+ EmitScalarExpr(E->getArg(0)),
+ EmitScalarExpr(E->getArg(1))
+ };
+ Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_mem_grow, ResultType);
+ return Builder.CreateCall(Callee, Args);
+ }
case WebAssembly::BI__builtin_wasm_current_memory: {
llvm::Type *ResultType = ConvertType(E->getType());
Value *Callee = CGM.getIntrinsic(Intrinsic::wasm_current_memory, ResultType);
@@ -10266,6 +12113,93 @@ Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
SmallVector<llvm::Value *, 4> Ops;
Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ auto MakeCircLd = [&](unsigned IntID, bool HasImm) {
+ // The base pointer is passed by address, so it needs to be loaded.
+ Address BP = EmitPointerWithAlignment(E->getArg(0));
+ BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
+ BP.getAlignment());
+ llvm::Value *Base = Builder.CreateLoad(BP);
+ // Operands are Base, Increment, Modifier, Start.
+ if (HasImm)
+ Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
+ EmitScalarExpr(E->getArg(3)) };
+ else
+ Ops = { Base, EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2)) };
+
+ llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
+ llvm::Value *NewBase = Builder.CreateExtractValue(Result, 1);
+ llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
+ NewBase->getType()->getPointerTo());
+ Address Dest = EmitPointerWithAlignment(E->getArg(0));
+ // The intrinsic generates two results. The new value for the base pointer
+ // needs to be stored.
+ Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
+ return Builder.CreateExtractValue(Result, 0);
+ };
+
+ auto MakeCircSt = [&](unsigned IntID, bool HasImm) {
+ // The base pointer is passed by address, so it needs to be loaded.
+ Address BP = EmitPointerWithAlignment(E->getArg(0));
+ BP = Address(Builder.CreateBitCast(BP.getPointer(), Int8PtrPtrTy),
+ BP.getAlignment());
+ llvm::Value *Base = Builder.CreateLoad(BP);
+ // Operands are Base, Increment, Modifier, Value, Start.
+ if (HasImm)
+ Ops = { Base, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)),
+ EmitScalarExpr(E->getArg(3)), EmitScalarExpr(E->getArg(4)) };
+ else
+ Ops = { Base, EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3)) };
+
+ llvm::Value *NewBase = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
+ llvm::Value *LV = Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
+ NewBase->getType()->getPointerTo());
+ Address Dest = EmitPointerWithAlignment(E->getArg(0));
+ // The intrinsic generates one result, which is the new value for the base
+ // pointer. It needs to be stored.
+ return Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
+ };
+
+ // Handle the conversion of bit-reverse load intrinsics to bit code.
+  // The intrinsic call after this function only reads from memory; the
+  // write to memory is handled by the store instruction.
+ auto MakeBrevLd = [&](unsigned IntID, llvm::Type *DestTy) {
+ // The intrinsic generates one result, which is the new value for the base
+ // pointer. It needs to be returned. The result of the load instruction is
+ // passed to intrinsic by address, so the value needs to be stored.
+ llvm::Value *BaseAddress =
+ Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), Int8PtrTy);
+
+ // Expressions like &(*pt++) will be incremented per evaluation.
+    // EmitPointerWithAlignment and EmitScalarExpr evaluate the expression
+    // once per call.
+ Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
+ DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), Int8PtrTy),
+ DestAddr.getAlignment());
+ llvm::Value *DestAddress = DestAddr.getPointer();
+
+ // Operands are Base, Dest, Modifier.
+ // The intrinsic format in LLVM IR is defined as
+ // { ValueType, i8* } (i8*, i32).
+ Ops = {BaseAddress, EmitScalarExpr(E->getArg(2))};
+
+ llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
+ // The value needs to be stored as the variable is passed by reference.
+ llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
+
+ // The store needs to be truncated to fit the destination type.
+    // While i32 and i64 are natively supported on Hexagon, i8 and i16 need
+    // to be handled with stores of the respective destination type.
+ DestVal = Builder.CreateTrunc(DestVal, DestTy);
+
+ llvm::Value *DestForStore =
+ Builder.CreateBitCast(DestAddress, DestVal->getType()->getPointerTo());
+ Builder.CreateAlignedStore(DestVal, DestForStore, DestAddr.getAlignment());
+ // The updated value of the base pointer is returned.
+ return Builder.CreateExtractValue(Result, 1);
+ };
+
switch (BuiltinID) {
case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B: {
@@ -10311,6 +12245,64 @@ Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
Builder.CreateAlignedStore(Vprd, Base, Dest.getAlignment());
return Builder.CreateExtractValue(Result, 0);
}
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadri_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrub_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrb_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadruh_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrh_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadri_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
+ return MakeCircLd(Intrinsic::hexagon_L2_loadrd_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerb_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerh_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerf_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
+ return MakeCircSt(Intrinsic::hexagon_S2_storeri_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerd_pci, /*HasImm*/true);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerb_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerh_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerf_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
+ return MakeCircSt(Intrinsic::hexagon_S2_storeri_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
+ return MakeCircSt(Intrinsic::hexagon_S2_storerd_pcr, /*HasImm*/false);
+ case Hexagon::BI__builtin_brev_ldub:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
+ case Hexagon::BI__builtin_brev_ldb:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
+ case Hexagon::BI__builtin_brev_lduh:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
+ case Hexagon::BI__builtin_brev_ldh:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
+ case Hexagon::BI__builtin_brev_ldw:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
+ case Hexagon::BI__builtin_brev_ldd:
+ return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
+ default:
+ break;
} // switch
return nullptr;