diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index af4cc9da9c9f4..bc37cedb435ae 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -261,6 +261,82 @@ TEST_P(MCPlusBuilderTester, testAccessedRegsMultipleDefs) {
                 {AArch64::W5, AArch64::X5, AArch64::W5_HI});
 }
 
+TEST_P(MCPlusBuilderTester, AArch64_Psign_Pauth_variants) {
+  if (GetParam() != Triple::aarch64)
+    GTEST_SKIP();
+  // Sign checks: operand-less PACIASP/PACIBSP must be reported as signing LR.
+  MCInst Paciasp = MCInstBuilder(AArch64::PACIASP);
+  MCInst Pacibsp = MCInstBuilder(AArch64::PACIBSP);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciasp));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibsp));
+  // PACIA/PACIB built with explicit LR and SP operands also count.
+  MCInst PaciaSPLR =
+      MCInstBuilder(AArch64::PACIA).addReg(AArch64::LR).addReg(AArch64::SP);
+  MCInst PacibSPLR =
+      MCInstBuilder(AArch64::PACIB).addReg(AArch64::LR).addReg(AArch64::SP);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(PaciaSPLR));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(PacibSPLR));
+  // PACIZA/PACIZB whose only operand is X5 must not be reported.
+  MCInst PacizaX5 = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::X5);
+  MCInst PacizbX5 = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::X5);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizaX5));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizbX5));
+  // The same opcodes with LR as the operand must be reported.
+  MCInst Paciaz = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::LR);
+  MCInst Pacibz = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::LR);
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciaz));
+  ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibz));
+  // Operand-less PACIA1716/PACIB1716 must not be reported as signing LR.
+  MCInst Pacia1716 = MCInstBuilder(AArch64::PACIA1716);
+  MCInst Pacib1716 = MCInstBuilder(AArch64::PACIB1716);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia1716));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib1716));
+  // Neither must the PACIA171615/PACIB171615 forms.
+  MCInst Pacia171615 = MCInstBuilder(AArch64::PACIA171615);
+  MCInst Pacib171615 = MCInstBuilder(AArch64::PACIB171615);
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia171615));
+  ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib171615));
+  // Authentication checks mirror the signing ones, using isPAuthOnLR.
+  MCInst Autiasp = MCInstBuilder(AArch64::AUTIASP);
+  MCInst Autibsp = MCInstBuilder(AArch64::AUTIBSP);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiasp));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibsp));
+  // AUTIA/AUTIB built with explicit LR and SP operands must be reported.
+  MCInst AutiaSPLR =
+      MCInstBuilder(AArch64::AUTIA).addReg(AArch64::LR).addReg(AArch64::SP);
+  MCInst AutibSPLR =
+      MCInstBuilder(AArch64::AUTIB).addReg(AArch64::LR).addReg(AArch64::SP);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutiaSPLR));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutibSPLR));
+  // AUTIZA/AUTIZB whose only operand is X5 must not be reported.
+  MCInst AutizaX5 = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::X5);
+  MCInst AutizbX5 = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::X5);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizaX5));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizbX5));
+  // The same opcodes with LR as the operand must be reported.
+  MCInst Autiaz = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::LR);
+  MCInst Autibz = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::LR);
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiaz));
+  ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibz));
+  // Operand-less AUTIA1716/AUTIB1716 must not be reported.
+  MCInst Autia1716 = MCInstBuilder(AArch64::AUTIA1716);
+  MCInst Autib1716 = MCInstBuilder(AArch64::AUTIB1716);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia1716));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib1716));
+  // Neither must the AUTIA171615/AUTIB171615 forms.
+  MCInst Autia171615 = MCInstBuilder(AArch64::AUTIA171615);
+  MCInst Autib171615 = MCInstBuilder(AArch64::AUTIB171615);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia171615));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib171615));
+  // RETAA/RETAB must be matched by isPAuthAndRet only, not by isPAuthOnLR.
+  MCInst Retaa = MCInstBuilder(AArch64::RETAA);
+  MCInst Retab = MCInstBuilder(AArch64::RETAB);
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retaa));
+  ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retab));
+  ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retaa));
+  ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retab));
+}
+
 #endif // AARCH64_AVAILABLE
 
 #ifdef X86_AVAILABLE
diff --git a/clang/cmake/modules/CMakeLists.txt b/clang/cmake/modules/CMakeLists.txt
index d2d68121371bf..9ad2f984f0e27 100644
--- a/clang/cmake/modules/CMakeLists.txt
+++ b/clang/cmake/modules/CMakeLists.txt
@@ -8,15 +8,19 @@ include(FindPrefixFromConfig)
 # the usual CMake convention seems to be ${Project}Targets.cmake.
 set(CLANG_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/clang" CACHE STRING
   "Path for CMake subdirectory for Clang (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/clang')")
-# CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below.
-set(clang_cmake_builddir "${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/clang")
 
 # Keep this in sync with llvm/cmake/CMakeLists.txt!
 set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING
   "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')")
 # CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below.
-string(REPLACE "${CMAKE_CFG_INTDIR}" "." llvm_cmake_builddir "${LLVM_LIBRARY_DIR}")
-set(llvm_cmake_builddir "${llvm_cmake_builddir}/cmake/llvm")
+string(REPLACE "${CMAKE_CFG_INTDIR}" "." llvm_builddir "${LLVM_LIBRARY_DIR}")
+set(llvm_cmake_builddir "${llvm_builddir}/cmake/llvm")
+if(CLANG_BUILT_STANDALONE)
+  # Standalone: stay under CMAKE_BINARY_DIR. CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse it.
+  set(clang_cmake_builddir "${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/clang")
+else()  # Otherwise place the files next to LLVM's, under the same llvm_builddir.
+  set(clang_cmake_builddir "${llvm_builddir}/cmake/clang")
+endif()
 
 get_property(CLANG_EXPORTS GLOBAL PROPERTY CLANG_EXPORTS)
 export(TARGETS ${CLANG_EXPORTS} FILE ${clang_cmake_builddir}/ClangTargets.cmake)
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index d942578dd7596..dcfa4e3ccc401 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -205,6 +205,50 @@ pointers with a specified address space. If the option is set to false, then
 reports from the specific x86 address spaces 256, 257 and 258 are still
 suppressed, but null dereferences from other address spaces are reported.
 
+.. _core-NullPointerArithm:
+
+core.NullPointerArithm (C, C++)
+"""""""""""""""""""""""""""""""
+Check for undefined arithmetic operations with null pointers.
+
+The checker can detect the following cases:
+
+  - ``p + x`` and ``x + p`` where ``p`` is a null pointer and ``x`` is a nonzero
+    integer value.
+  - ``p - x`` where ``p`` is a null pointer and ``x`` is a nonzero integer
+    value.
+  - ``p1 - p2`` where one of ``p1`` and ``p2`` is null and the other a
+    non-null pointer.
+
+The result of these operations is undefined according to the standard.
+In the cases listed above, the checker warns even if the expression described
+as "nonzero" or "non-null" has an unknown value, because it is likely that it
+can have a nonzero value during program execution.
+
+.. code-block:: c
+
+ void test1(int *p, int offset) {
+   if (p)
+     return;
+
+   int *p1 = p + offset; // warn: 'p' is null, 'offset' is unknown but likely non-zero
+ }
+
+ void test2(int *p, int offset) {
+   if (p) { } // this indicates that it is possible for 'p' to be null
+   if (offset == 0)
+     return;
+
+   int *p1 = p - offset; // warn: 'p' is null, 'offset' is known to be non-zero
+ }
+
+ void test3(char *p1, char *p2) {
+   if (p1)
+     return;
+
+   int a = p1 - p2; // warn: 'p1' is null, 'p2' is unknown but likely non-null
+ }
+
 .. _core-StackAddressEscape:
 
 core.StackAddressEscape (C)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 279c0c7935e36..62c70fba946be 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1359,23 +1359,17 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
   def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">;
-}
-
-let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
-}
-
-let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vpconflictsi_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">;
 }
 
-let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">;
   def vpconflictsi_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">;
 }
 
-let Features = "avx512cd", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512cd", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpconflictdi_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">;
   def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
 }
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 4473c54d8d6e3..b83bbcdb85a8f 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -195,6 +195,11 @@ def NullDereferenceChecker
       HelpText<"Check for dereferences of null pointers">,
       Documentation<HasDocumentation>;
 
+def NullPointerArithmChecker
+    : Checker<"NullPointerArithm">,
+      HelpText<"Check for undefined arithmetic operations on null pointers">,
+      Documentation<HasDocumentation>;
+
 def NonNullParamChecker : Checker<"NonNullParamChecker">,
   HelpText<"Check for null pointers passed as arguments to a function whose "
            "arguments are references or marked with the 'nonnull' attribute">,
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 89043968915a9..a72282caf5e73 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1358,9 +1358,6 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm,
 
 void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED,
                        const APSInt &Value) {
-  if (S.EvaluatingDecl && !S.EvaluatingDecl->isConstexpr())
-    return;
-
   llvm::APInt Min;
   llvm::APInt Max;
   ED->getValueRange(Max, Min);
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 57cc705282d1b..812d25fc79490 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -3096,7 +3096,8 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   }
 
   if (Offset.isZero()) {
-    if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) {
+    if (const Descriptor *Desc = Ptr.getFieldDesc();
+        Desc && Desc->isArray() && Ptr.getIndex() == 0) {
       S.Stk.push<Pointer>(Ptr.atIndex(0).narrow());
       return true;
     }
@@ -3126,7 +3127,8 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
   }
 
   if (Offset.isZero()) {
-    if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) {
+    if (const Descriptor *Desc = Ptr.getFieldDesc();
+        Desc && Desc->isArray() && Ptr.getIndex() == 0) {
       S.Stk.push<Pointer>(Ptr.atIndex(0).narrow());
       return true;
     }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index b69f3607e82d6..a0d2c764121d9 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3101,6 +3101,33 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC,
+                                            const CallExpr *Call) {
+  assert(Call->getNumArgs() == 1);
+  // Result[I] is a bitmask whose bit J (J < I) is set iff Src[J] == Src[I].
+  const auto *VecT = Call->getArg(0)->getType()->castAs<VectorType>();
+  PrimType ElemT = *S.getContext().classify(VecT->getElementType());
+  unsigned NumElems = VecT->getNumElements();
+  // Vector result: ask for the element signedness, not the vector type's.
+  bool DestUnsigned = Call->getType()->hasUnsignedIntegerRepresentation();
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  for (unsigned I = 0; I != NumElems; ++I) {
+    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+      APSInt ElemI = Src.elem<T>(I).toAPSInt();
+      APInt ConflictMask(ElemI.getBitWidth(), 0);
+      for (unsigned J = 0; J != I; ++J) {
+        APSInt ElemJ = Src.elem<T>(J).toAPSInt();
+        ConflictMask.setBitVal(J, ElemI == ElemJ);
+      }
+      Dst.elem<T>(I) = static_cast<T>(APSInt(ConflictMask, DestUnsigned));
+    });
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3891,7 +3918,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
         [](const APSInt &Lo, const APSInt &Hi, const APSInt &Amt) {
           return llvm::APIntOps::fshr(Hi, Lo, Amt);
         });
-
+  case X86::BI__builtin_ia32_vpconflictsi_128:
+  case X86::BI__builtin_ia32_vpconflictsi_256:
+  case X86::BI__builtin_ia32_vpconflictsi_512:
+  case X86::BI__builtin_ia32_vpconflictdi_128:
+  case X86::BI__builtin_ia32_vpconflictdi_256:
+  case X86::BI__builtin_ia32_vpconflictdi_512:
+    return interp__builtin_ia32_vpconflict(S, OpPC, Call);
   case clang::X86::BI__builtin_ia32_blendpd:
   case clang::X86::BI__builtin_ia32_blendpd256:
   case clang::X86::BI__builtin_ia32_blendps:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a07eb2254e13f..16141b27f4ce8 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12179,6 +12179,37 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case X86::BI__builtin_ia32_vpconflictsi_128:
+  case X86::BI__builtin_ia32_vpconflictsi_256:
+  case X86::BI__builtin_ia32_vpconflictsi_512:
+  case X86::BI__builtin_ia32_vpconflictdi_128:
+  case X86::BI__builtin_ia32_vpconflictdi_256:
+  case X86::BI__builtin_ia32_vpconflictdi_512: {
+    APValue Source;
+    // Evaluate the single vector operand; bail out if that fails.
+    if (!EvaluateAsRValue(Info, E->getArg(0), Source))
+      return false;
+
+    unsigned SourceLen = Source.getVectorLength();
+    SmallVector<APValue, 32> ResultElements;
+    ResultElements.reserve(SourceLen);
+    // Result elements take their signedness from the call's element type.
+    const auto *VecT = E->getType()->castAs<VectorType>();
+    bool DestUnsigned =
+        VecT->getElementType()->isUnsignedIntegerOrEnumerationType();
+    // Element I gets bit J (J < I) set iff Source[J] == Source[I].
+    for (unsigned I = 0; I != SourceLen; ++I) {
+      const APValue &EltI = Source.getVectorElt(I);
+
+      APInt ConflictMask(EltI.getInt().getBitWidth(), 0);
+      for (unsigned J = 0; J != I; ++J) {
+        const APValue &EltJ = Source.getVectorElt(J);
+        ConflictMask.setBitVal(J, EltI.getInt() == EltJ.getInt());
+      }
+      ResultElements.push_back(APValue(APSInt(ConflictMask, DestUnsigned)));
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_blendpd256:
   case X86::BI__builtin_ia32_blendps:
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 85c70de22e023..12e2813ef2ec7 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -647,6 +647,68 @@ StringRef CGDebugInfo::getCurrentDirname() {
   return CGM.getCodeGenOpts().DebugCompilationDir;
 }
 
+static llvm::dwarf::SourceLanguage GetSourceLanguage(const CodeGenModule &CGM) {
+  const CodeGenOptions &CGO = CGM.getCodeGenOpts();
+  const LangOptions &LO = CGM.getLangOpts();
+  // Picks a DW_LANG_* code; callers must not use this beyond DWARF version 5.
+  assert(CGO.DwarfVersion <= 5);
+  // With strict DWARF below v5, the C++11/14, OpenCL and C11 codes are skipped.
+  llvm::dwarf::SourceLanguage LangTag;
+  if (LO.CPlusPlus) {
+    if (LO.ObjC)
+      LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus;
+    else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
+    else if (LO.CPlusPlus14)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14;
+    else if (LO.CPlusPlus11)
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11;
+    else
+      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
+  } else if (LO.ObjC) {
+    LangTag = llvm::dwarf::DW_LANG_ObjC;
+  } else if (LO.OpenCL && (!CGO.DebugStrictDwarf || CGO.DwarfVersion >= 5)) {
+    LangTag = llvm::dwarf::DW_LANG_OpenCL;
+  } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) {
+    LangTag = llvm::dwarf::DW_LANG_C11;
+  } else if (LO.C99) {
+    LangTag = llvm::dwarf::DW_LANG_C99;
+  } else {
+    LangTag = llvm::dwarf::DW_LANG_C89;
+  }
+
+  return LangTag;
+}
+
+static llvm::DISourceLanguageName
+GetDISourceLanguageName(const CodeGenModule &CGM) {
+  // Emit pre-DWARFv6 language codes.
+  if (CGM.getCodeGenOpts().DwarfVersion < 6)
+    return llvm::DISourceLanguageName(GetSourceLanguage(CGM));
+  // From here on a DW_LNAME_* name is paired with a separate version number.
+  const LangOptions &LO = CGM.getLangOpts();
+  // Only plain C++ and C look up a version; every other language keeps 0.
+  uint32_t LangVersion = 0;
+  llvm::dwarf::SourceLanguageName LangTag;
+  if (LO.CPlusPlus) {
+    if (LO.ObjC) {
+      LangTag = llvm::dwarf::DW_LNAME_ObjC_plus_plus;
+    } else {
+      LangTag = llvm::dwarf::DW_LNAME_C_plus_plus;
+      LangVersion = LO.getCPlusPlusLangStd().value_or(0);
+    }
+  } else if (LO.ObjC) {
+    LangTag = llvm::dwarf::DW_LNAME_ObjC;
+  } else if (LO.OpenCL) {
+    LangTag = llvm::dwarf::DW_LNAME_OpenCL_C;
+  } else {
+    LangTag = llvm::dwarf::DW_LNAME_C;
+    LangVersion = LO.getCLangStd().value_or(0);
+  }
+
+  return llvm::DISourceLanguageName(LangTag, LangVersion);
+}
+
 void CGDebugInfo::CreateCompileUnit() {
   SmallString<64> Checksum;
   std::optional<llvm::DIFile::ChecksumKind> CSKind;
@@ -702,31 +764,6 @@ void CGDebugInfo::CreateCompileUnit() {
     }
   }
 
-  llvm::dwarf::SourceLanguage LangTag;
-  if (LO.CPlusPlus) {
-    if (LO.ObjC)
-      LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus;
-    else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
-    else if (LO.CPlusPlus14)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14;
-    else if (LO.CPlusPlus11)
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11;
-    else
-      LangTag = llvm::dwarf::DW_LANG_C_plus_plus;
-  } else if (LO.ObjC) {
-    LangTag = llvm::dwarf::DW_LANG_ObjC;
-  } else if (LO.OpenCL && (!CGM.getCodeGenOpts().DebugStrictDwarf ||
-                           CGM.getCodeGenOpts().DwarfVersion >= 5)) {
-    LangTag = llvm::dwarf::DW_LANG_OpenCL;
-  } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) {
-      LangTag = llvm::dwarf::DW_LANG_C11;
-  } else if (LO.C99) {
-    LangTag = llvm::dwarf::DW_LANG_C99;
-  } else {
-    LangTag = llvm::dwarf::DW_LANG_C89;
-  }
-
   std::string Producer = getClangFullVersion();
 
   // Figure out which version of the ObjC runtime we have.
@@ -787,7 +824,7 @@ void CGDebugInfo::CreateCompileUnit() {
 
   // Create new compile unit.
   TheCU = DBuilder.createCompileUnit(
-      llvm::DISourceLanguageName(LangTag), CUFile,
+      GetDISourceLanguageName(CGM), CUFile,
       CGOpts.EmitVersionIdentMetadata ? Producer : "",
       CGOpts.OptimizationLevel != 0 || CGOpts.PrepareForLTO ||
           CGOpts.PrepareForThinLTO,
@@ -1234,20 +1271,46 @@ llvm::DIType *CGDebugInfo::CreateType(const PointerType *Ty,
                                Ty->getPointeeType(), Unit);
 }
 
-/// \return whether a C++ mangling exists for the type defined by TD.
-static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) {
-  switch (TheCU->getSourceLanguage().getUnversionedName()) {
+static bool hasCXXMangling(llvm::dwarf::SourceLanguage Lang, bool IsTagDecl) {
+  switch (Lang) {
   case llvm::dwarf::DW_LANG_C_plus_plus:
   case llvm::dwarf::DW_LANG_C_plus_plus_11:
   case llvm::dwarf::DW_LANG_C_plus_plus_14:
     return true;
   case llvm::dwarf::DW_LANG_ObjC_plus_plus:
-    return isa<CXXRecordDecl>(TD) || isa<EnumDecl>(TD);
+    return IsTagDecl;
+  default:
+    return false;
+  }
+}
+
+static bool hasCXXMangling(llvm::dwarf::SourceLanguageName Lang,
+                           bool IsTagDecl) {
+  switch (Lang) {
+  case llvm::dwarf::DW_LNAME_C_plus_plus:
+    return true;
+  case llvm::dwarf::DW_LNAME_ObjC_plus_plus:
+    return IsTagDecl;
   default:
     return false;
   }
 }
 
+/// \return whether a C++ mangling exists for the type defined by TD.
+static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) {
+  const bool IsTagDecl = isa<CXXRecordDecl>(TD) || isa<EnumDecl>(TD);
+  // A versioned name is handled as a DW_LNAME_* code, else as a DW_LANG_* one.
+  if (llvm::DISourceLanguageName SourceLang = TheCU->getSourceLanguage();
+      SourceLang.hasVersionedName())
+    return hasCXXMangling(
+        static_cast<llvm::dwarf::SourceLanguageName>(SourceLang.getName()),
+        IsTagDecl);
+  else
+    return hasCXXMangling(
+        static_cast<llvm::dwarf::SourceLanguage>(SourceLang.getName()),
+        IsTagDecl);
+}
+
 // Determines if the debug info for this tag declaration needs a type
 // identifier. The purpose of the unique identifier is to deduplicate type
 // information for identical types across TUs. Because of the C++ one definition
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 1e58c3f217812..342a3af0ac1ee 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -82,6 +82,8 @@ TargetCodeGenInfo::~TargetCodeGenInfo() = default;
 // If someone can figure out a general rule for this, that would be great.
 // It's probably just doomed to be platform-dependent, though.
 unsigned TargetCodeGenInfo::getSizeOfUnwindException() const {
+  if (getABIInfo().getCodeGenOpts().hasSEHExceptions())
+    return getABIInfo().getDataLayout().getPointerSizeInBits() > 32 ? 64 : 48;
   // Verified for:
   //   x86-64     FreeBSD, Linux, Darwin
   //   x86-32     FreeBSD, Linux, Darwin
diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h
index b16144044d928..fb6dcb6dd8ad1 100644
--- a/clang/lib/Headers/avx512cdintrin.h
+++ b/clang/lib/Headers/avx512cdintrin.h
@@ -15,111 +15,98 @@
 #define __AVX512CDINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS                                                     \
+  constexpr __attribute__((__always_inline__, __nodebug__,                     \
+                           __target__("avx512cd"), __min_vector_width__(512)))
+#else
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"),       \
                  __min_vector_width__(512)))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
 #endif
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi64 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
+_mm512_conflict_epi64(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpconflictdi_512((__v8di)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_conflict_epi64(__A),
-                                             (__v8di)__W);
+_mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_conflict_epi64(__A), (__v8di)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
-{
+_mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                              (__v8di)_mm512_conflict_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512 ());
+                                             (__v8di)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi32 (__m512i __A)
-{
-  return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
+_mm512_conflict_epi32(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpconflictsi_512((__v16si)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_conflict_epi32(__A),
-                                            (__v16si)__W);
+_mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_conflict_epi32(__A), (__v16si)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                            (__v16si)_mm512_conflict_epi32(__A),
-                                            (__v16si)_mm512_setzero_si512());
+_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_conflict_epi32(__A),
+      (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm512_lzcnt_epi32(__m512i __A) {
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi32(__m512i __A) {
   return (__m512i)__builtin_elementwise_clzg((__v16si)__A,
                                              (__v16si)_mm512_set1_epi32(32));
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_lzcnt_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_lzcnt_epi32(__A), (__v16si)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                              (__v16si)_mm512_lzcnt_epi32(__A),
                                              (__v16si)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm512_lzcnt_epi64(__m512i __A) {
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi64(__m512i __A) {
   return (__m512i)__builtin_elementwise_clzg(
       (__v8di)__A, (__v8di)_mm512_set1_epi64((long long)64));
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_lzcnt_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_lzcnt_epi64(__A), (__v8di)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                              (__v8di)_mm512_lzcnt_epi64(__A),
                                              (__v8di)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastmb_epi64(__mmask8 __A) {
-  return (__m512i) _mm512_set1_epi64((long long) __A);
+  return (__m512i)_mm512_set1_epi64((long long)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_broadcastmw_epi32(__mmask16 __A) {
   return (__m512i)_mm512_set1_epi32((int)__A);
 }
 
 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif
diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h
index cb98e7c514bde..7719680faf93a 100644
--- a/clang/lib/Headers/avx512vlcdintrin.h
+++ b/clang/lib/Headers/avx512vlcdintrin.h
@@ -14,203 +14,182 @@
 #define __AVX512VLCDINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128                                                  \
+  constexpr __attribute__((__always_inline__, __nodebug__,                     \
+                           __target__("avx512vl,avx512cd"),                    \
+                           __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  constexpr __attribute__((__always_inline__, __nodebug__,                     \
+                           __target__("avx512vl,avx512cd"),                    \
+                           __min_vector_width__(256)))
+#else
 #define __DEFAULT_FN_ATTRS128                                                  \
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256                                                  \
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
-#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
-#else
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
-#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
 #endif
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmb_epi64(__mmask8 __A) {
   return (__m128i) _mm_set1_epi64x((long long) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmb_epi64(__mmask8 __A) {
-  return (__m256i) _mm256_set1_epi64x((long long)__A);
+  return (__m256i)_mm256_set1_epi64x((long long)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmw_epi32(__mmask16 __A) {
   return (__m128i) _mm_set1_epi32((int)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmw_epi32(__mmask16 __A) {
   return (__m256i) _mm256_set1_epi32((int)__A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi64 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
+_mm_conflict_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictdi_128((__v2di)__A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_conflict_epi64(__A),
-                                             (__v2di)__W);
+_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_conflict_epi64(__A), (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
-{
+_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                              (__v2di)_mm_conflict_epi64(__A),
                                              (__v2di)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi64 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
+_mm256_conflict_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictdi_256((__v4di)__A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_conflict_epi64(__A),
-                                             (__v4di)__W);
+_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      (__mmask8)__U, (__v4di)_mm256_conflict_epi64(__A), (__v4di)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
-{
+_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                              (__v4di)_mm256_conflict_epi64(__A),
                                              (__v4di)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi32 (__m128i __A)
-{
-  return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
+_mm_conflict_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictsi_128((__v4si)__A);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_conflict_epi32(__A),
-                                             (__v4si)__W);
+_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_conflict_epi32(__A), (__v4si)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
-{
+_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                              (__v4si)_mm_conflict_epi32(__A),
                                              (__v4si)_mm_setzero_si128());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi32 (__m256i __A)
-{
-  return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
+_mm256_conflict_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictsi_256((__v8si)__A);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_conflict_epi32(__A),
-                                             (__v8si)__W);
+_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      (__mmask8)__U, (__v8si)_mm256_conflict_epi32(__A), (__v8si)__W);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
-{
+_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                              (__v8si)_mm256_conflict_epi32(__A),
                                              (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_lzcnt_epi32(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_lzcnt_epi32(__m128i __A) {
   return (__m128i)__builtin_elementwise_clzg((__v4si)__A,
                                              (__v4si)_mm_set1_epi32(32));
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_lzcnt_epi32(__A),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_lzcnt_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi32(__m256i __A) {
   return (__m256i)__builtin_elementwise_clzg((__v8si)__A,
                                              (__v8si)_mm256_set1_epi32(32));
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_lzcnt_epi32(__A),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_ia32_selectd_256(
+      (__mmask8)__U, (__v8si)_mm256_lzcnt_epi32(__A), (__v8si)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
                                              (__v8si)_mm256_lzcnt_epi32(__A),
                                              (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_lzcnt_epi64(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_lzcnt_epi64(__m128i __A) {
   return (__m128i)__builtin_elementwise_clzg(
       (__v2di)__A, (__v2di)_mm_set1_epi64x((long long)64));
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_lzcnt_epi64(__A),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_lzcnt_epi64(__A),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi64(__m256i __A) {
   return (__m256i)__builtin_elementwise_clzg(
       (__v4di)__A, (__v4di)_mm256_set1_epi64x((long long)64));
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_lzcnt_epi64(__A),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_ia32_selectq_256(
+      (__mmask8)__U, (__v4di)_mm256_lzcnt_epi64(__A), (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                              (__v4di)_mm256_lzcnt_epi64(__A),
@@ -219,7 +198,5 @@ _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
-#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif /* __AVX512VLCDINTRIN_H */
diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
index 395d724cdfd11..37f5ec3557400 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp
@@ -19,6 +19,7 @@
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -39,9 +40,10 @@ class DerefBugType : public BugType {
 
 class DereferenceChecker
     : public CheckerFamily<check::Location, check::Bind,
+                           check::PreStmt<BinaryOperator>,
                            EventDispatcher<ImplicitNullDerefEvent>> {
-  void reportBug(const DerefBugType &BT, ProgramStateRef State, const Stmt *S,
-                 CheckerContext &C) const;
+  void reportDerefBug(const DerefBugType &BT, ProgramStateRef State,
+                      const Stmt *S, CheckerContext &C) const;
 
   bool suppressReport(CheckerContext &C, const Expr *E) const;
 
@@ -50,6 +52,7 @@ class DereferenceChecker
                      CheckerContext &C) const;
   void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit,
                  CheckerContext &C) const;
+  void checkPreStmt(const BinaryOperator *Op, CheckerContext &C) const;
 
   static void AddDerefSource(raw_ostream &os,
                              SmallVectorImpl<SourceRange> &Ranges,
@@ -57,7 +60,7 @@ class DereferenceChecker
                              const LocationContext *LCtx,
                              bool loadedFrom = false);
 
-  CheckerFrontend NullDerefChecker, FixedDerefChecker;
+  CheckerFrontend NullDerefChecker, FixedDerefChecker, NullPointerArithmChecker;
   const DerefBugType NullBug{&NullDerefChecker, "Dereference of null pointer",
                              "a null pointer dereference",
                              "a dereference of a null pointer"};
@@ -72,9 +75,22 @@ class DereferenceChecker
   const DerefBugType FixedAddressBug{&FixedDerefChecker,
                                      "Dereference of a fixed address",
                                      "a dereference of a fixed address"};
+  const BugType NullPointerArithmBug{
+      &NullPointerArithmChecker,
+      "Possibly undefined arithmetic operation involving a null pointer"};
 
   StringRef getDebugTag() const override { return "DereferenceChecker"; }
 };
+
+struct ValueDescStr {
+  SmallVectorImpl<SourceRange> &Ranges;
+  const Expr *Ex;
+  const ProgramState *State;
+  const LocationContext *LCtx;
+  bool IsPointer;
+  ConditionTruthVal IsNull;
+};
+
 } // end anonymous namespace
 
 void
@@ -173,9 +189,9 @@ static bool isDeclRefExprToReference(const Expr *E) {
   return false;
 }
 
-void DereferenceChecker::reportBug(const DerefBugType &BT,
-                                   ProgramStateRef State, const Stmt *S,
-                                   CheckerContext &C) const {
+void DereferenceChecker::reportDerefBug(const DerefBugType &BT,
+                                        ProgramStateRef State, const Stmt *S,
+                                        CheckerContext &C) const {
   if (&BT == &FixedAddressBug) {
     if (!FixedDerefChecker.isEnabled())
       // Deliberately don't add a sink node if check is disabled.
@@ -249,9 +265,8 @@ void DereferenceChecker::reportBug(const DerefBugType &BT,
 
   bugreporter::trackExpressionValue(N, bugreporter::getDerefExpr(S), *BR);
 
-  for (SmallVectorImpl<SourceRange>::iterator
-       I = Ranges.begin(), E = Ranges.end(); I!=E; ++I)
-    BR->addRange(*I);
+  for (const auto &R : Ranges)
+    BR->addRange(R);
 
   C.emitReport(std::move(BR));
 }
@@ -262,7 +277,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
   if (l.isUndef()) {
     const Expr *DerefExpr = getDereferenceExpr(S);
     if (!suppressReport(C, DerefExpr))
-      reportBug(UndefBug, C.getState(), DerefExpr, C);
+      reportDerefBug(UndefBug, C.getState(), DerefExpr, C);
     return;
   }
 
@@ -283,7 +298,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
       // we call an "explicit" null dereference.
       const Expr *expr = getDereferenceExpr(S);
       if (!suppressReport(C, expr)) {
-        reportBug(NullBug, nullState, expr, C);
+        reportDerefBug(NullBug, nullState, expr, C);
         return;
       }
     }
@@ -301,7 +316,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S,
   if (location.isConstant()) {
     const Expr *DerefExpr = getDereferenceExpr(S, isLoad);
     if (!suppressReport(C, DerefExpr))
-      reportBug(FixedAddressBug, notNullState, DerefExpr, C);
+      reportDerefBug(FixedAddressBug, notNullState, DerefExpr, C);
     return;
   }
 
@@ -317,7 +332,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
 
   // One should never write to label addresses.
   if (auto Label = L.getAs<loc::GotoLabel>()) {
-    reportBug(LabelBug, C.getState(), S, C);
+    reportDerefBug(LabelBug, C.getState(), S, C);
     return;
   }
 
@@ -338,7 +353,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
     if (!StNonNull) {
       const Expr *expr = getDereferenceExpr(S, /*IsBind=*/true);
       if (!suppressReport(C, expr)) {
-        reportBug(NullBug, StNull, expr, C);
+        reportDerefBug(NullBug, StNull, expr, C);
         return;
       }
     }
@@ -356,7 +371,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   if (V.isConstant()) {
     const Expr *DerefExpr = getDereferenceExpr(S, true);
     if (!suppressReport(C, DerefExpr))
-      reportBug(FixedAddressBug, State, DerefExpr, C);
+      reportDerefBug(FixedAddressBug, State, DerefExpr, C);
     return;
   }
 
@@ -379,6 +394,96 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S,
   C.addTransition(State, this);
 }
 
+namespace llvm {
+template <> struct format_provider<ValueDescStr> {
+  static void format(const ValueDescStr &V, raw_ostream &Stream,
+                     StringRef Style) {
+    static const char *ValueStr[2][3] = {
+        {"zero", "nonzero integer value", "probably nonzero integer value"},
+        {"null pointer", "non-null pointer", "probably non-null pointer"},
+    };
+    Stream
+        << ValueStr[V.IsPointer][V.IsNull.isConstrainedTrue()
+                                     ? 0
+                                     : (V.IsNull.isConstrainedFalse() ? 1 : 2)];
+    DereferenceChecker::AddDerefSource(Stream, V.Ranges, V.Ex, V.State, V.LCtx,
+                                       false);
+  }
+};
+} // namespace llvm
+
+void DereferenceChecker::checkPreStmt(const BinaryOperator *Op,
+                                      CheckerContext &C) const {
+  if (!Op->isAdditiveOp() || !NullPointerArithmChecker.isEnabled())
+    return;
+  const Expr *E1 = Op->getLHS();
+  const Expr *E2 = Op->getRHS();
+  QualType T1 = E1->getType().getCanonicalType();
+  QualType T2 = E2->getType().getCanonicalType();
+  bool T1IsPointer = T1->isPointerType();
+  bool T2IsPointer = T2->isPointerType();
+  if (T1->isIntegerType() && T2->isIntegerType())
+    return;
+  if (!T1IsPointer && !T1->isIntegerType() && !T2IsPointer &&
+      !T2->isIntegerType())
+    return;
+
+  ProgramStateRef State = C.getState();
+  ConditionTruthVal V1IsNull = State->isNull(C.getSVal(E1));
+  ConditionTruthVal V2IsNull = State->isNull(C.getSVal(E2));
+  bool IsConstrained = true;
+
+  // Case 'NULL + x' / 'NULL - x': pointer known null, offset not known zero.
+  if (T1IsPointer && !T2IsPointer) {
+    if (!V1IsNull.isConstrainedTrue() || V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained = V2IsNull.isConstrainedFalse();
+  }
+
+  // Case 'x + NULL': the same check with the operand roles swapped.
+  if (!T1IsPointer && T2IsPointer) {
+    if (V1IsNull.isConstrainedTrue() || !V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained = V1IsNull.isConstrainedFalse();
+  }
+
+  // Case 'NULL - p' or 'p - NULL': exactly one pointer is known to be null.
+  if (T1IsPointer && T2IsPointer) {
+    if (!V1IsNull.isConstrainedTrue() && !V2IsNull.isConstrainedTrue())
+      return;
+    if (V1IsNull.isConstrainedTrue() && V2IsNull.isConstrainedTrue())
+      return;
+    IsConstrained =
+        V1IsNull.isConstrainedFalse() || V2IsNull.isConstrainedFalse();
+  }
+
+  SmallVector<SourceRange, 2> Ranges;
+  const char *OpcodeStr =
+      Op->getOpcode() == BO_Add ? "Addition" : "Subtraction";
+  const char *ResultStr = IsConstrained ? "results" : "may result";
+  ValueDescStr DerefArg1{
+      Ranges, E1, State.get(), C.getLocationContext(), T1IsPointer, V1IsNull};
+  ValueDescStr DerefArg2{
+      Ranges, E2, State.get(), C.getLocationContext(), T2IsPointer, V2IsNull};
+  std::string Msg =
+      llvm::formatv("{0} of a {1} and a {2} {3} in undefined behavior",
+                    OpcodeStr, DerefArg1, DerefArg2, ResultStr);
+
+  ExplodedNode *N = C.generateErrorNode(State);
+  if (!N)
+    return;
+  auto BR =
+      std::make_unique<PathSensitiveBugReport>(NullPointerArithmBug, Msg, N);
+  if (V1IsNull.isConstrainedTrue())
+    bugreporter::trackExpressionValue(N, E1, *BR);
+  if (V2IsNull.isConstrainedTrue())
+    bugreporter::trackExpressionValue(N, E2, *BR);
+  for (const auto &R : Ranges)
+    BR->addRange(R);
+
+  C.emitReport(std::move(BR));
+}
+
 void ento::registerNullDereferenceChecker(CheckerManager &Mgr) {
   Mgr.getChecker<DereferenceChecker>()->NullDerefChecker.enable(Mgr);
 }
@@ -395,3 +500,11 @@ bool ento::shouldRegisterFixedAddressDereferenceChecker(
     const CheckerManager &) {
   return true;
 }
+
+void ento::registerNullPointerArithmChecker(CheckerManager &Mgr) {
+  Mgr.getChecker<DereferenceChecker>()->NullPointerArithmChecker.enable(Mgr);
+}
+
+bool ento::shouldRegisterNullPointerArithmChecker(const CheckerManager &) {
+  return true;
+}
diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp
index 72bc7622eb6d8..8efd3201d6200 100644
--- a/clang/test/AST/ByteCode/cxx11.cpp
+++ b/clang/test/AST/ByteCode/cxx11.cpp
@@ -146,6 +146,14 @@ void testValueInRangeOfEnumerationValues() {
 
   const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context
 }
+struct EnumTest {
+  enum type {
+      Type1,
+      BOUND
+  };
+  static const type binding_completed = type(BOUND + 1); // both-error {{in-class initializer for static data member is not a constant expression}} \
+                                                         // both-note {{integer value 2 is outside the valid range of values}}
+};
 
 template<class T, unsigned size> struct Bitfield {
   static constexpr T max = static_cast<T>((1 << size) - 1);
diff --git a/clang/test/AST/ByteCode/typeid.cpp b/clang/test/AST/ByteCode/typeid.cpp
index 00b01c8e40682..090309d16e737 100644
--- a/clang/test/AST/ByteCode/typeid.cpp
+++ b/clang/test/AST/ByteCode/typeid.cpp
@@ -59,3 +59,13 @@ namespace TypeidPtrInEvaluationResult {
   consteval const std::type_info *ftype_info() { return &typeid(c); }
   const std::type_info *T1 = ftype_info();
 }
+
+// Regression test for crash in ArrayElemPtrPop with typeid pointers. GH-163127
+namespace TypeidPtrRegression {
+  void dontcrash() {
+    // This must be diagnosed as an error rather than crash the compiler.
+    constexpr auto res = ((void**)&typeid(int))[0]; // both-error {{must be initialized by a constant expression}} \
+                                                                // both-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}}
+  }
+}
+
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 009233108a70a..bfe418b112a9d 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -19,6 +19,7 @@
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
+// CHECK-NEXT: core.NullPointerArithm
 // CHECK-NEXT: core.StackAddressEscape
 // CHECK-NEXT: core.UndefinedBinaryOperatorResult
 // CHECK-NEXT: core.VLASize
diff --git a/clang/test/Analysis/null-pointer-arithm.c b/clang/test/Analysis/null-pointer-arithm.c
new file mode 100644
index 0000000000000..228824767937f
--- /dev/null
+++ b/clang/test/Analysis/null-pointer-arithm.c
@@ -0,0 +1,76 @@
+// RUN: %clang_analyze_cc1 -verify %s \
+// RUN:   -analyzer-checker=core
+
+extern int *get_pointer();
+
+int *test_add1(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}}
+}
+
+int *test_add2(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset) {}
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}}
+}
+
+int *test_add3(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset != 0) return 0;
+  return p + offset;
+}
+
+int *test_add4(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  if (offset == 0) return 0;
+  return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}}
+}
+
+int *test_add5(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return offset + p; // expected-warning{{Addition of a probably nonzero integer value (from variable 'offset') and a null pointer (from variable 'p') may result in undefined behavior}}
+}
+
+int *test_sub1(int offset) {
+  int *p = get_pointer();
+  if (p) {}
+  return p - offset; // expected-warning{{Subtraction of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}}
+}
+
+int test_sub_p1() {
+  int *p = get_pointer();
+  if (p) {}
+  return p - p;
+}
+
+int test_sub_p2() {
+  int *p1 = get_pointer();
+  int *p2 = get_pointer();
+  if (p1) {}
+  if (p2) {}
+  return p1 - p2;
+  // expected-warning@-1{{Subtraction of a non-null pointer (from variable 'p1') and a null pointer (from variable 'p2') results in undefined behavior}}
+  // expected-warning@-2{{Subtraction of a null pointer (from variable 'p1') and a non-null pointer (from variable 'p2') results in undefined behavior}}
+}
+
+int test_sub_p3() {
+  int *p1 = get_pointer();
+  int *p2 = get_pointer();
+  if (p1) {}
+  return p1 - p2; // expected-warning{{Subtraction of a null pointer (from variable 'p1') and a probably non-null pointer (from variable 'p2') may result in undefined behavior}}
+}
+
+struct S {
+  char *p;
+  int offset;
+};
+
+char *test_struct(struct S s) {
+  if (s.p) {}
+  return s.p + s.offset; // expected-warning{{Addition of a null pointer (via field 'p') and a probably nonzero integer value (via field 'offset') may result in undefined behavior}}
+}
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 7fae958f6afc6..9b3296064981f 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -27,6 +27,7 @@
 // CHECK-NEXT: core.NonNullParamChecker
 // CHECK-NEXT: core.NonnilStringConstants
 // CHECK-NEXT: core.NullDereference
+// CHECK-NEXT: core.NullPointerArithm
 // CHECK-NEXT: core.StackAddressEscape
 // CHECK-NEXT: core.UndefinedBinaryOperatorResult
 // CHECK-NEXT: core.VLASize
diff --git a/clang/test/C/C2y/n3364.c b/clang/test/C/C2y/n3364.c
index f95c77fb3018f..ccf7e8d491346 100644
--- a/clang/test/C/C2y/n3364.c
+++ b/clang/test/C/C2y/n3364.c
@@ -37,6 +37,6 @@ double d3 = -DBL_SNAN;
 long double ld1 = LDBL_SNAN;
 long double ld2 = +LDBL_SNAN;
 long double ld3 = -LDBL_SNAN;
-// CHECK: @ld1 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000}}
-// CHECK: @ld2 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000}}
-// CHECK: @ld3 = {{.*}}global {{double 0xFFF4000000000000|x86_fp80 0xKFFFFA000000000000000|fp128 0xL0000000000000000FFFF400000000000}}
+// CHECK: @ld1 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}}
+// CHECK: @ld2 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}}
+// CHECK: @ld3 = {{.*}}global {{double 0xFFF4000000000000|x86_fp80 0xKFFFFA000000000000000|fp128 0xL0000000000000000FFFF400000000000|ppc_fp128 0xMFFF40000000000008000000000000000}}
diff --git a/clang/test/CodeGen/X86/avx512cd-builtins.c b/clang/test/CodeGen/X86/avx512cd-builtins.c
index 2890889348c87..80a20b1244532 100644
--- a/clang/test/CodeGen/X86/avx512cd-builtins.c
+++ b/clang/test/CodeGen/X86/avx512cd-builtins.c
@@ -14,37 +14,53 @@
 __m512i test_mm512_conflict_epi64(__m512i __A) {
   // CHECK-LABEL: test_mm512_conflict_epi64
   // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}})
-  return _mm512_conflict_epi64(__A); 
+  return _mm512_conflict_epi64(__A);
 }
+
+TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0));
+TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){5, 5, 5, 5, 5, 5, 5, 5}), 0, 1, 3, 7, 15, 31, 63, 127));
+TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 0, 0, 0, 0));
 __m512i test_mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_conflict_epi64
   // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_conflict_epi64(__W,__U,__A); 
+  return _mm512_mask_conflict_epi64(__W,__U,__A);
 }
+
+TEST_CONSTEXPR(match_v8di(_mm512_mask_conflict_epi64((__m512i)(__v8di){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, 0x55, (__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF));
 __m512i test_mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_maskz_conflict_epi64
   // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_conflict_epi64(__U,__A); 
+  return _mm512_maskz_conflict_epi64(__U,__A);
 }
+
+TEST_CONSTEXPR(match_v8di(_mm512_maskz_conflict_epi64(0x55, (__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0));
 __m512i test_mm512_conflict_epi32(__m512i __A) {
   // CHECK-LABEL: test_mm512_conflict_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}})
-  return _mm512_conflict_epi32(__A); 
+  return _mm512_conflict_epi32(__A);
 }
+
+TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0, 1, 0, 2, 0, 5, 0, 0, 0, 256, 0, 512, 0, 1280, 0));
+TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}), 0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767));
+TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 __m512i test_mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_conflict_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_conflict_epi32(__W,__U,__A); 
+  return _mm512_mask_conflict_epi32(__W,__U,__A);
 }
+
+TEST_CONSTEXPR(match_v16si(_mm512_mask_conflict_epi32((__m512i)(__v16si){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, 0x5555, (__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF, 0, 0xFF, 256, 0xFF, 512, 0xFF, 1280, 0xFF));
 __m512i test_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_maskz_conflict_epi32
   // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_conflict_epi32(__U,__A); 
+  return _mm512_maskz_conflict_epi32(__U,__A);
 }
+
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_conflict_epi32(0x5555, (__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0, 1, 0, 2, 0, 5, 0, 0, 0, 256, 0, 512, 0, 1280, 0));
 __m512i test_mm512_lzcnt_epi32(__m512i __A) {
   // CHECK-LABEL: test_mm512_lzcnt_epi32
   // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 true)
diff --git a/clang/test/CodeGen/X86/avx512vlcd-builtins.c b/clang/test/CodeGen/X86/avx512vlcd-builtins.c
index 56c04a08c6322..29fc6fd2e7fc8 100644
--- a/clang/test/CodeGen/X86/avx512vlcd-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlcd-builtins.c
@@ -66,83 +66,114 @@ TEST_CONSTEXPR(match_v8si(_mm256_broadcastmw_epi32((__mmask16)(0xcafe)), 0xcafe,
 __m128i test_mm_conflict_epi64(__m128i __A) {
   // CHECK-LABEL: test_mm_conflict_epi64
   // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}})
-  return _mm_conflict_epi64(__A); 
+  return _mm_conflict_epi64(__A);
 }
 
+TEST_CONSTEXPR(match_v2di(_mm_conflict_epi64((__m128i)(__v2di){1, 2}), 0, 0));
+TEST_CONSTEXPR(match_v2di(_mm_conflict_epi64((__m128i)(__v2di){5, 5}), 0, 1));
+
 __m128i test_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_conflict_epi64
   // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}})
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
-  return _mm_mask_conflict_epi64(__W, __U, __A); 
+  return _mm_mask_conflict_epi64(__W, __U, __A);
 }
 
+TEST_CONSTEXPR(match_v2di(_mm_mask_conflict_epi64((__m128i)(__v2di){0xFF, 0xFF}, 0x2, (__m128i)(__v2di){5, 5}), 0xFF, 1));
+
 __m128i test_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_conflict_epi64
   // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}})
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
-  return _mm_maskz_conflict_epi64(__U, __A); 
+  return _mm_maskz_conflict_epi64(__U, __A);
 }
 
+TEST_CONSTEXPR(match_v2di(_mm_maskz_conflict_epi64(0x2, (__m128i)(__v2di){5, 5}), 0, 1));
+
 __m256i test_mm256_conflict_epi64(__m256i __A) {
   // CHECK-LABEL: test_mm256_conflict_epi64
   // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}})
-  return _mm256_conflict_epi64(__A); 
+  return _mm256_conflict_epi64(__A);
 }
 
+TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){1, 2, 1, 3}), 0, 0, 1, 0));
+TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){7, 7, 7, 7}), 0, 1, 3, 7));
+TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){1, 2, 3, 4}), 0, 0, 0, 0));
+
 __m256i test_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_mask_conflict_epi64
   // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
-  return _mm256_mask_conflict_epi64(__W, __U, __A); 
+  return _mm256_mask_conflict_epi64(__W, __U, __A);
 }
 
+TEST_CONSTEXPR(match_v4di(_mm256_mask_conflict_epi64((__m256i)(__v4di){0xFF, 0xFF, 0xFF, 0xFF}, 0x5, (__m256i)(__v4di){1, 2, 1, 3}), 0, 0xFF, 1, 0xFF));
+
 __m256i test_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_maskz_conflict_epi64
   // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
-  return _mm256_maskz_conflict_epi64(__U, __A); 
+  return _mm256_maskz_conflict_epi64(__U, __A);
 }
 
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_conflict_epi64(0x5, (__m256i)(__v4di){1, 2, 1, 3}), 0, 0, 1, 0));
+
 __m128i test_mm_conflict_epi32(__m128i __A) {
   // CHECK-LABEL: test_mm_conflict_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}})
-  return _mm_conflict_epi32(__A); 
+  return _mm_conflict_epi32(__A);
 }
 
+TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){1, 2, 1, 3}), 0, 0, 1, 0));
+TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){3, 3, 3, 3}), 0, 1, 3, 7));
+TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){1, 2, 3, 4}), 0, 0, 0, 0));
+
 __m128i test_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_conflict_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_mask_conflict_epi32(__W, __U, __A); 
+  return _mm_mask_conflict_epi32(__W, __U, __A);
 }
 
+TEST_CONSTEXPR(match_v4si(_mm_mask_conflict_epi32((__m128i)(__v4si){0xFF, 0xFF, 0xFF, 0xFF}, 0x5, (__m128i)(__v4si){1, 2, 1, 3}), 0, 0xFF, 1, 0xFF));
+
 __m128i test_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_conflict_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_conflict_epi32(__U, __A); 
+  return _mm_maskz_conflict_epi32(__U, __A);
 }
 
+TEST_CONSTEXPR(match_v4si(_mm_maskz_conflict_epi32(0x5, (__m128i)(__v4si){1, 2, 1, 3}), 0, 0, 1, 0));
+
 __m256i test_mm256_conflict_epi32(__m256i __A) {
   // CHECK-LABEL: test_mm256_conflict_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}})
-  return _mm256_conflict_epi32(__A); 
+  return _mm256_conflict_epi32(__A);
 }
 
+TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0));
+TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){4, 4, 4, 4, 4, 4, 4, 4}), 0, 1, 3, 7, 15, 31, 63, 127));
+TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 0, 0, 0, 0));
+
 __m256i test_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_mask_conflict_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_mask_conflict_epi32(__W, __U, __A); 
+  return _mm256_mask_conflict_epi32(__W, __U, __A);
 }
 
+TEST_CONSTEXPR(match_v8si(_mm256_mask_conflict_epi32((__m256i)(__v8si){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, /*0101 0101=*/0x55, (__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF));
+
 __m256i test_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
   // CHECK-LABEL: test_mm256_maskz_conflict_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_conflict_epi32(__U, __A); 
+  return _mm256_maskz_conflict_epi32(__U, __A);
 }
 
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_conflict_epi32(0x55, (__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0));
+
 __m128i test_mm_lzcnt_epi32(__m128i __A) {
   // CHECK-LABEL: test_mm_lzcnt_epi32
   // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 true)
diff --git a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
index 4fb977a5367e7..e40b2d7ae43ea 100644
--- a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
+++ b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-DARWIN
 // RUN: %clang_cc1 -triple arm-unknown-gnueabi -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-EABI
 // RUN: %clang_cc1 -triple mipsel-unknown-unknown -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=MIPS
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-X86-64
+// RUN: %clang_cc1 -triple thumbv7-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-ARMV7
 
 void foo();
 void test() {
@@ -25,9 +27,15 @@ void test() {
 // ARM-EABI-NEXT:   [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 88
 // MIPS:            [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
 // MIPS-NEXT:       [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 24
+// MINGW-X86-64:     [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
+// MINGW-X86-64-NEXT:[[T1:%.*]] = getelementptr i8, ptr [[EXN]], i64 64
+// MINGW-ARMV7:      [[T0:%.*]] = tail call arm_aapcs_vfpcc ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]]
+// MINGW-ARMV7-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 48
 
 // X86-64: attributes [[NUW]] = { nounwind }
 // X86-32: attributes [[NUW]] = { nounwind }
 // ARM-DARWIN: attributes [[NUW]] = { nounwind }
 // ARM-EABI: attributes [[NUW]] = { nounwind }
 // MIPS: attributes [[NUW]] = { nounwind }
+// MINGW-X86-64: attributes [[NUW]] = { nounwind }
+// MINGW-ARMV7: attributes [[NUW]] = { nounwind }
diff --git a/clang/test/DebugInfo/CXX/versioned-language.cpp b/clang/test/DebugInfo/CXX/versioned-language.cpp
new file mode 100644
index 0000000000000..4cb2b29086035
--- /dev/null
+++ b/clang/test/DebugInfo/CXX/versioned-language.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c++98 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++98 | FileCheck %s --check-prefix=CHECK-CPP98
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++03 | FileCheck %s --check-prefix=CHECK-CPP03
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++11 | FileCheck %s --check-prefix=CHECK-CPP11
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++14 | FileCheck %s --check-prefix=CHECK-CPP14
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++17 | FileCheck %s --check-prefix=CHECK-CPP17
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++20 | FileCheck %s --check-prefix=CHECK-CPP20
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++23 | FileCheck %s --check-prefix=CHECK-CPP23
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++2c | FileCheck %s --check-prefix=CHECK-CPP2C
+
+struct Foo {} globalVar;
+
+// CHECK-CPP98:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711
+// FIXME: C++03 technically has no official standard version code. From Clang's point of view C++03 and C++98 are interchangeable.
+// CHECK-CPP03:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711
+// CHECK-CPP11:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201103
+// CHECK-CPP14:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201402
+// CHECK-CPP17:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201703
+// CHECK-CPP20:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202002
+// CHECK-CPP23:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202302
+// CHECK-CPP2C:     !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202400
diff --git a/clang/test/DebugInfo/Generic/versioned-language.c b/clang/test/DebugInfo/Generic/versioned-language.c
new file mode 100644
index 0000000000000..1faa7b4b56d4e
--- /dev/null
+++ b/clang/test/DebugInfo/Generic/versioned-language.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c99 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c89 | FileCheck %s --check-prefix=CHECK-C89 --implicit-check-not "sourceLanguageVersion"
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c99 | FileCheck %s --check-prefix=CHECK-C99
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c11 | FileCheck %s --check-prefix=CHECK-C11
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c17 | FileCheck %s --check-prefix=CHECK-C17
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c23 | FileCheck %s --check-prefix=CHECK-C23
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c2y | FileCheck %s --check-prefix=CHECK-C2Y
+
+int globalVar = 10;
+
+// CHECK-C89: !DICompileUnit(sourceLanguageName: DW_LNAME_C,
+// CHECK-C99: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 199901
+// CHECK-C11: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201112
+// CHECK-C17: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201710
+// CHECK-C23: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202311
+// CHECK-C2Y: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202400
diff --git a/clang/test/DebugInfo/ObjC/versioned-language.m b/clang/test/DebugInfo/ObjC/versioned-language.m
new file mode 100644
index 0000000000000..178c47bf8c841
--- /dev/null
+++ b/clang/test/DebugInfo/ObjC/versioned-language.m
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJC
+
+int globalVar = 10;
+
+// CHECK-OBJC: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC,
diff --git a/clang/test/DebugInfo/ObjCXX/versioned-language.mm b/clang/test/DebugInfo/ObjCXX/versioned-language.mm
new file mode 100644
index 0000000000000..bfdce462b2bf1
--- /dev/null
+++ b/clang/test/DebugInfo/ObjCXX/versioned-language.mm
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion"
+//
+// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \
+// RUN:    | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJCXX
+
+int globalVar = 10;
+
+// CHECK-OBJCXX: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus,
diff --git a/flang/include/flang/Optimizer/CMakeLists.txt b/flang/include/flang/Optimizer/CMakeLists.txt
index 3336ac935e101..68af52f1b8dc7 100644
--- a/flang/include/flang/Optimizer/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/CMakeLists.txt
@@ -2,4 +2,5 @@ add_subdirectory(CodeGen)
 add_subdirectory(Dialect)
 add_subdirectory(HLFIR)
 add_subdirectory(Transforms)
+add_subdirectory(OpenACC)
 add_subdirectory(OpenMP)
diff --git a/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
new file mode 100644
index 0000000000000..a032488569b19
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name FIROpenACC)
+
+add_public_tablegen_target(FIROpenACCPassesIncGen)
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.h b/flang/include/flang/Optimizer/OpenACC/Passes.h
new file mode 100644
index 0000000000000..0627cc8ce4a6d
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.h
@@ -0,0 +1,33 @@
+//===- Passes.h - OpenACC pass entry points -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header declares the OpenACC passes specific to Fortran and FIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES_H
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+
+#include <memory>
+
+namespace fir {
+namespace acc {
+#define GEN_PASS_DECL
+#define GEN_PASS_REGISTRATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+
+std::unique_ptr<mlir::Pass> createACCRecipeBufferizationPass();
+
+} // namespace acc
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenACC/Passes.td b/flang/include/flang/Optimizer/OpenACC/Passes.td
new file mode 100644
index 0000000000000..3c127b30aa9b8
--- /dev/null
+++ b/flang/include/flang/Optimizer/OpenACC/Passes.td
@@ -0,0 +1,36 @@
+//===-- Passes.td - flang OpenACC pass definitions -----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_OPENACC_PASSES
+#define FORTRAN_OPTIMIZER_OPENACC_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def ACCRecipeBufferization
+    : Pass<"fir-acc-recipe-bufferization", "mlir::ModuleOp"> {
+  let summary = "Rewrite acc.*.recipe box values to ref<box> and update uses";
+  let description = [{
+    Bufferizes OpenACC recipes that operate on fir.box<T> so their type and
+    region block arguments become fir.ref<fir.box<T>> instead. This applies to
+    acc.private.recipe, acc.firstprivate.recipe (including copy region), and
+    acc.reduction.recipe (including combiner region).
+
+    For affected regions, the pass inserts required loads at the beginning of
+    the region to preserve original uses after argument type changes. For yields
+    of box values, the pass allocates a local fir.ref<fir.box<T>> and stores the
+    yielded fir.box<T> into it so the region yields a reference to a box.
+
+    For acc.private, acc.firstprivate, and acc.reduction operations that use a
+    bufferized recipe, the pass allocates a host-side fir.ref<fir.box<T>> before
+    the data op and rewires the data op to use the new memory. Other users of
+    the original data operation result (outside the paired compute op) are
+    updated to load through the reference.
+  }];
+}
+
+#endif // FORTRAN_OPTIMIZER_OPENACC_PASSES
diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index 2954a1c4769f7..0f851830edd46 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -38,6 +38,7 @@ template <typename T, typename U = std::remove_const_t<T>> U AsRvalue(T &t) {
 template <typename T> T &&AsRvalue(T &&t) { return std::move(t); }
 
 const Scope &GetScopingUnit(const Scope &scope);
+const Scope &GetProgramUnit(const Scope &scope);
 
 // There is no consistent way to get the source of an ActionStmt, but there
 // is "source" in Statement<T>. This structure keeps the ActionStmt with the
diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
index fc23e64eeb7a4..790b9fdb1589a 100644
--- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(Support)
+add_subdirectory(Transforms)
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
new file mode 100644
index 0000000000000..4840a999ecd27
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
@@ -0,0 +1,226 @@
+//===- ACCRecipeBufferization.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bufferize OpenACC recipes that yield fir.box<T> to operate on
+// fir.ref<fir.box<T>> and update uses accordingly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include <cassert>
+#include <memory>
+#include <optional>
+#include <type_traits>
+
+namespace fir::acc {
+#define GEN_PASS_DEF_ACCRECIPEBUFFERIZATION
+#include "flang/Optimizer/OpenACC/Passes.h.inc"
+} // namespace fir::acc
+
+namespace {
+
+/// Helpers that decide which recipe types get bufferized and that build the
+/// FIR load/alloca/store operations used by the rewrite.
+class BufferizeInterface {
+public:
+  /// Return a fir reference to \p recipeType when it is a fir::BaseBoxType,
+  /// and std::nullopt for any other type (nothing to bufferize).
+  static std::optional<mlir::Type> mustBufferize(mlir::Type recipeType) {
+    if (auto boxTy = llvm::dyn_cast<fir::BaseBoxType>(recipeType))
+      return fir::ReferenceType::get(boxTy);
+    return std::nullopt;
+  }
+
+  /// Create a fir.load of \p value at the builder's insertion point.
+  static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc,
+                               mlir::Value value) {
+    return builder.create<fir::LoadOp>(loc, value);
+  }
+
+  /// Create a fir.alloca of \p value's type, store \p value into it, and
+  /// return the alloca result.
+  static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc,
+                                   mlir::Value value) {
+    auto alloca = builder.create<fir::AllocaOp>(loc, value.getType());
+    builder.create<fir::StoreOp>(loc, value, alloca);
+    return alloca;
+  }
+};
+
+/// Retype the arguments of \p region from \p oldType to \p newType and make
+/// its acc.yield return memory instead of values of \p oldType.
+///
+/// Uses of a retyped argument are redirected to a load created through a
+/// builder positioned on the region. Only the terminator of the last block is
+/// checked for an acc.yield, and an empty region is left untouched.
+static void bufferizeRegionArgsAndYields(mlir::Region &region,
+                                         mlir::Location loc, mlir::Type oldType,
+                                         mlir::Type newType) {
+  if (region.empty())
+    return;
+
+  mlir::OpBuilder builder(&region);
+  for (mlir::BlockArgument arg : region.getArguments()) {
+    if (arg.getType() == oldType) {
+      arg.setType(newType);
+      if (!arg.use_empty()) {
+        mlir::Operation *loadOp = BufferizeInterface::load(builder, loc, arg);
+        // The new load is itself a user of `arg` and must keep reading it.
+        arg.replaceAllUsesExcept(loadOp->getResult(0), loadOp);
+      }
+    }
+  }
+  if (auto yield =
+          llvm::dyn_cast<mlir::acc::YieldOp>(region.back().getTerminator())) {
+    llvm::SmallVector<mlir::Value> newOperands;
+    newOperands.reserve(yield.getNumOperands());
+    bool changed = false;
+    for (mlir::Value oldYieldArg : yield.getOperands()) {
+      if (oldYieldArg.getType() == oldType) {
+        builder.setInsertionPoint(yield);
+        mlir::Value alloca =
+            BufferizeInterface::placeInMemory(builder, loc, oldYieldArg);
+        newOperands.push_back(alloca);
+        changed = true;
+      } else {
+        newOperands.push_back(oldYieldArg);
+      }
+    }
+    if (changed)
+      yield->setOperands(newOperands);
+  }
+}
+
+/// For every operand in \p operands whose recipe in \p recipes is named
+/// \p recipeSymName: store the data operation's variable into a new
+/// allocation, make the data operation use that allocation, and give its
+/// result the allocation's type. Users of the result other than \p computeOp
+/// then read the value back through a load inserted right before them.
+static void updateRecipeUse(mlir::ArrayAttr recipes, mlir::ValueRange operands,
+                            llvm::StringRef recipeSymName,
+                            mlir::Operation *computeOp) {
+  if (!recipes)
+    return;
+  for (auto [recipeSym, oldRes] : llvm::zip(recipes, operands)) {
+    if (llvm::cast<mlir::SymbolRefAttr>(recipeSym).getLeafReference() !=
+        recipeSymName)
+      continue;
+
+    mlir::Operation *dataOp = oldRes.getDefiningOp();
+    assert(dataOp && "dataOp must be paired with computeOp");
+    mlir::Location loc = dataOp->getLoc();
+    mlir::OpBuilder builder(dataOp);
+    llvm::TypeSwitch<mlir::Operation *, void>(dataOp)
+        .Case<mlir::acc::PrivateOp, mlir::acc::FirstprivateOp,
+              mlir::acc::ReductionOp>([&](auto privateOp) {
+          builder.setInsertionPointAfterValue(privateOp.getVar());
+          mlir::Value alloca = BufferizeInterface::placeInMemory(
+              builder, loc, privateOp.getVar());
+          privateOp.getVarMutable().assign(alloca);
+          privateOp.getAccVar().setType(alloca.getType());
+        });
+
+    // Snapshot the users before rewriting them. getUsers() yields an
+    // operation once per use, so skip repeats: the first visit already
+    // replaces every use in that operation, and a second visit would only
+    // leave an unused load behind.
+    llvm::SmallVector<mlir::Operation *> users;
+    for (mlir::Operation *useOp : oldRes.getUsers())
+      if (useOp != computeOp && !llvm::is_contained(users, useOp))
+        users.push_back(useOp);
+    for (mlir::Operation *useOp : users) {
+      builder.setInsertionPoint(useOp);
+      mlir::Operation *load = BufferizeInterface::load(builder, loc, oldRes);
+      useOp->replaceUsesOfWith(oldRes, load->getResult(0));
+    }
+  }
+}
+
+/// Module pass: bufferizes every acc recipe whose type requires it, then
+/// updates the data operations feeding acc.loop, acc.parallel and acc.serial
+/// operations that reference one of those recipes.
+class ACCRecipeBufferization
+    : public fir::acc::impl::ACCRecipeBufferizationBase<
+          ACCRecipeBufferization> {
+public:
+  void runOnOperation() override {
+    mlir::ModuleOp module = getOperation();
+
+    // Step 1: rewrite the recipes and remember the names of those changed.
+    llvm::SmallVector<llvm::StringRef> recipeNames;
+    module.walk([&](mlir::Operation *recipe) {
+      llvm::TypeSwitch<mlir::Operation *, void>(recipe)
+          .Case<mlir::acc::PrivateRecipeOp, mlir::acc::FirstprivateRecipeOp,
+                mlir::acc::ReductionRecipeOp>([&](auto recipe) {
+            mlir::Type oldType = recipe.getType();
+            auto bufferizedType =
+                BufferizeInterface::mustBufferize(recipe.getType());
+            if (!bufferizedType)
+              return;
+            recipe.setTypeAttr(mlir::TypeAttr::get(*bufferizedType));
+            mlir::Location loc = recipe.getLoc();
+            using RecipeOp = decltype(recipe);
+            bufferizeRegionArgsAndYields(recipe.getInitRegion(), loc, oldType,
+                                         *bufferizedType);
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::FirstprivateRecipeOp>)
+              bufferizeRegionArgsAndYields(recipe.getCopyRegion(), loc, oldType,
+                                           *bufferizedType);
+            if constexpr (std::is_same_v<RecipeOp,
+                                         mlir::acc::ReductionRecipeOp>)
+              bufferizeRegionArgsAndYields(recipe.getCombinerRegion(), loc,
+                                           oldType, *bufferizedType);
+            bufferizeRegionArgsAndYields(recipe.getDestroyRegion(), loc,
+                                         oldType, *bufferizedType);
+            recipeNames.push_back(recipe.getSymName());
+          });
+    });
+    if (recipeNames.empty())
+      return;
+
+    // Step 2: update the data operations that use a rewritten recipe.
+    module.walk([&](mlir::Operation *op) {
+      llvm::TypeSwitch<mlir::Operation *, void>(op)
+          .Case<mlir::acc::LoopOp, mlir::acc::ParallelOp, mlir::acc::SerialOp>(
+              [&](auto computeOp) {
+                for (llvm::StringRef recipeName : recipeNames) {
+                  if (computeOp.getPrivatizationRecipes())
+                    updateRecipeUse(computeOp.getPrivatizationRecipesAttr(),
+                                    computeOp.getPrivateOperands(), recipeName,
+                                    op);
+                  if (computeOp.getFirstprivatizationRecipes())
+                    updateRecipeUse(
+                        computeOp.getFirstprivatizationRecipesAttr(),
+                        computeOp.getFirstprivateOperands(), recipeName, op);
+                  if (computeOp.getReductionRecipes())
+                    updateRecipeUse(computeOp.getReductionRecipesAttr(),
+                                    computeOp.getReductionOperands(),
+                                    recipeName, op);
+                }
+              });
+    });
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::acc::createACCRecipeBufferizationPass() {
+  return std::make_unique<ACCRecipeBufferization>();
+}
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..2427da03e1a3c
--- /dev/null
+++ b/flang/lib/Optimizer/OpenACC/Transforms/CMakeLists.txt
@@ -0,0 +1,17 @@
+# OpenACC transformation passes that operate on FIR.
+add_flang_library(FIROpenACCTransforms
+  ACCRecipeBufferization.cpp
+
+  DEPENDS
+  FIROpenACCPassesIncGen
+
+  LINK_LIBS
+  FIRDialect
+
+  # MLIR libraries are passed through MLIR_LIBS (not LINK_LIBS) so that
+  # add_flang_library can link them against the MLIR dylib when one is in use.
+  MLIR_LIBS
+  MLIRIR
+  MLIRPass
+  MLIROpenACCDialect
+)
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index d65a89e768466..17019d99636af 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3017,8 +3017,8 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) {
                                                         &objs,
                                                     std::string clause) {
             for (const auto &obj : objs.v) {
-              if (const parser::Name *
-                  objName{parser::Unwrap<parser::Name>(obj)}) {
+              if (const parser::Name *objName{
+                      parser::Unwrap<parser::Name>(obj)}) {
                 if (&objName->symbol->GetUltimate() == eventHandleSym) {
                   context_.Say(GetContext().clauseSource,
                       "A variable: `%s` that appears in a DETACH clause cannot appear on %s clause on the same construct"_err_en_US,
@@ -3637,7 +3637,8 @@ void OmpStructureChecker::CheckReductionModifier(
   if (modifier.v == ReductionModifier::Value::Task) {
     // "Task" is only allowed on worksharing or "parallel" directive.
     static llvm::omp::Directive worksharing[]{
-        llvm::omp::Directive::OMPD_do, llvm::omp::Directive::OMPD_scope,
+        llvm::omp::Directive::OMPD_do, //
+        llvm::omp::Directive::OMPD_scope, //
         llvm::omp::Directive::OMPD_sections,
         // There are more worksharing directives, but they do not apply:
         // "for" is C++ only,
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index a8ec4d6c24beb..292e73b4899c0 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -13,6 +13,7 @@
 #include "flang/Semantics/openmp-utils.h"
 
 #include "flang/Common/Fortran-consts.h"
+#include "flang/Common/idioms.h"
 #include "flang/Common/indirection.h"
 #include "flang/Common/reference.h"
 #include "flang/Common/visit.h"
@@ -59,6 +60,26 @@ const Scope &GetScopingUnit(const Scope &scope) {
   return *iter;
 }
 
+// Find the program unit containing "scope"; nested subprograms are skipped.
+const Scope &GetProgramUnit(const Scope &scope) {
+  const Scope *outermost{nullptr};
+  const Scope *current{&scope};
+  while (!current->IsTopLevel()) {
+    const Scope::Kind kind{current->kind()};
+    if (kind == Scope::Kind::BlockData || kind == Scope::Kind::MainProgram ||
+        kind == Scope::Kind::Module) {
+      return *current;
+    }
+    if (kind == Scope::Kind::Subprogram) {
+      // An enclosing subprogram found later replaces this one.
+      outermost = current;
+    }
+    current = &current->parent();
+  }
+  assert(outermost && "Scope not in a program unit");
+  return *outermost;
+}
+
 SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x) {
   if (x == nullptr) {
     return SourcedActionStmt{};
@@ -202,7 +223,7 @@ std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr) {
   // ForwardOwningPointer           typedExpr
   // `- GenericExprWrapper          ^.get()
   //    `- std::optional<Expr>      ^->v
-  return typedExpr.get()->v;
+  return DEREF(typedExpr.get()).v;
 }
 
 std::optional<evaluate::DynamicType> GetDynamicType(
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 18fc63814d973..de680b41d1524 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -3549,40 +3549,38 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
 void OmpAttributeVisitor::AddOmpRequiresToScope(Scope &scope,
     WithOmpDeclarative::RequiresFlags flags,
     std::optional<common::OmpMemoryOrderType> memOrder) {
-  Scope *scopeIter = &scope;
-  do {
-    if (Symbol * symbol{scopeIter->symbol()}) {
-      common::visit(
-          [&](auto &details) {
-            // Store clauses information into the symbol for the parent and
-            // enclosing modules, programs, functions and subroutines.
-            if constexpr (std::is_convertible_v<decltype(&details),
-                              WithOmpDeclarative *>) {
-              if (flags.any()) {
-                if (const WithOmpDeclarative::RequiresFlags *
-                    otherFlags{details.ompRequires()}) {
-                  flags |= *otherFlags;
-                }
-                details.set_ompRequires(flags);
+  const Scope &programUnit{omp::GetProgramUnit(scope)};
+
+  if (auto *symbol{const_cast<Symbol *>(programUnit.symbol())}) {
+    common::visit(
+        [&](auto &details) {
+          // Store clauses information into the symbol of the enclosing
+          // program unit (module, main program, block data or subprogram).
+          if constexpr (std::is_convertible_v<decltype(&details),
+                            WithOmpDeclarative *>) {
+            if (flags.any()) {
+              if (const WithOmpDeclarative::RequiresFlags *otherFlags{
+                      details.ompRequires()}) {
+                flags |= *otherFlags;
               }
-              if (memOrder) {
-                if (details.has_ompAtomicDefaultMemOrder() &&
-                    *details.ompAtomicDefaultMemOrder() != *memOrder) {
-                  context_.Say(scopeIter->sourceRange(),
-                      "Conflicting '%s' REQUIRES clauses found in compilation "
-                      "unit"_err_en_US,
-                      parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(
-                          llvm::omp::Clause::OMPC_atomic_default_mem_order)
-                                                     .str()));
-                }
-                details.set_ompAtomicDefaultMemOrder(*memOrder);
+              details.set_ompRequires(flags);
+            }
+            if (memOrder) {
+              if (details.has_ompAtomicDefaultMemOrder() &&
+                  *details.ompAtomicDefaultMemOrder() != *memOrder) {
+                context_.Say(programUnit.sourceRange(),
+                    "Conflicting '%s' REQUIRES clauses found in compilation "
+                    "unit"_err_en_US,
+                    parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(
+                        llvm::omp::Clause::OMPC_atomic_default_mem_order)
+                            .str()));
               }
+              details.set_ompAtomicDefaultMemOrder(*memOrder);
             }
-          },
-          symbol->details());
-    }
-    scopeIter = &scopeIter->parent();
-  } while (!scopeIter->IsGlobal());
+          }
+        },
+        symbol->details());
+  }
 }
 
 void OmpAttributeVisitor::IssueNonConformanceWarning(llvm::omp::Directive D,
diff --git a/flang/test/Fir/OpenACC/recipe-bufferization.mlir b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
new file mode 100644
index 0000000000000..c4f96f63d5076
--- /dev/null
+++ b/flang/test/Fir/OpenACC/recipe-bufferization.mlir
@@ -0,0 +1,316 @@
+// RUN: fir-opt %s --fir-acc-recipe-bufferization -split-input-file | FileCheck %s
+
+// -----
+
+acc.private.recipe @priv_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.allocmem i32
+  %2 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  %0 = fir.box_addr %arg1 : (!fir.box<i32>) -> !fir.ref<i32>
+  %1 = fir.convert %0 : (!fir.ref<i32>) -> !fir.heap<i32>
+  fir.freemem %1 : !fir.heap<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[DARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[DARG1:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LD1:.*]] = fir.load %[[DARG1]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[ADDR:.*]] = fir.box_addr %[[LD1]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[CVT:.*]] = fir.convert %[[ADDR]] : (!fir.ref<i32>) -> !fir.heap<i32>
+
+// -----
+
+// Test private recipe without destroy region.
+
+acc.private.recipe @priv_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %2 : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.private.recipe @priv_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[ARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX:.*]] = fir.embox
+// CHECK:   %[[ALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX]] to %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: }
+
+// -----
+
+// Firstprivate recipe with destroy region.
+acc.firstprivate.recipe @fp_ref_box : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP]] to %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC:.*]]: !fir.ref<!fir.box<i32>>, %[[DST:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC:.*]] = fir.load %[[SRC]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST:.*]] = fir.load %[[DST]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR:.*]] = fir.box_addr %[[LSRC]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL:.*]] = fir.load %[[SADDR]] : !fir.ref<i32>
+// CHECK:   %[[DADDR:.*]] = fir.box_addr %[[LDST]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL]] to %[[DADDR]] : !fir.ref<i32>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[FDARG0:.*]]: !fir.ref<!fir.box<i32>>, %[[FDARG1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Firstprivate recipe without destroy region.
+acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.box<i32> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} copy {
+^bb0(%src: !fir.box<i32>, %dst: !fir.box<i32>):
+  %s_addr = fir.box_addr %src : (!fir.box<i32>) -> !fir.ref<i32>
+  %val = fir.load %s_addr : !fir.ref<i32>
+  %d_addr = fir.box_addr %dst : (!fir.box<i32>) -> !fir.ref<i32>
+  fir.store %val to %d_addr : !fir.ref<i32>
+  acc.yield
+}
+
+// CHECK-LABEL: acc.firstprivate.recipe @fp_ref_box_no_destroy : !fir.ref<!fir.box<i32>> init
+// CHECK: ^bb0(%[[IARG2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOX_FP2:.*]] = fir.embox
+// CHECK:   %[[ALLOCA_FP2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOX_FP2]] to %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCA_FP2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } copy {
+// CHECK: ^bb0(%[[SRC2:.*]]: !fir.ref<!fir.box<i32>>, %[[DST2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LSRC2:.*]] = fir.load %[[SRC2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LDST2:.*]] = fir.load %[[DST2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[SADDR2:.*]] = fir.box_addr %[[LSRC2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[VAL2:.*]] = fir.load %[[SADDR2]] : !fir.ref<i32>
+// CHECK:   %[[DADDR2:.*]] = fir.box_addr %[[LDST2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   fir.store %[[VAL2]] to %[[DADDR2]] : !fir.ref<i32>
+
+// -----
+
+// Reduction recipe with destroy region.
+acc.reduction.recipe @red_ref_box : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.allocmem i32
+  %1 = fir.embox %0 : (!fir.heap<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+} destroy {
+^bb0(%arg0: !fir.box<i32>, %arg1: !fir.box<i32>):
+  acc.yield
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR]] to %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS:.*]] = fir.load %[[LHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS:.*]] = fir.load %[[RHS]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR:.*]] = fir.box_addr %[[LLHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL:.*]] = fir.load %[[LADDR]] : !fir.ref<i32>
+// CHECK:   %[[RADDR:.*]] = fir.box_addr %[[LRHS]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL:.*]] = fir.load %[[RADDR]] : !fir.ref<i32>
+// CHECK:   %[[SUM:.*]] = arith.addi %[[LVAL]], %[[RVAL]] : i32
+// CHECK:   %[[I32ALLOCA:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM]] to %[[I32ALLOCA]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX:.*]] = fir.embox %[[I32ALLOCA]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX]] to %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA]] : !fir.ref<!fir.box<i32>>
+// CHECK: } destroy {
+// CHECK: ^bb0(%[[RD0:.*]]: !fir.ref<!fir.box<i32>>, %[[RD1:.*]]: !fir.ref<!fir.box<i32>>)
+
+// -----
+
+// Reduction recipe without destroy region.
+acc.reduction.recipe @red_ref_box_no_destroy : !fir.box<i32> reduction_operator <add> init {
+^bb0(%arg0: !fir.box<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.embox %0 : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %1 : !fir.box<i32>
+} combiner {
+^bb0(%lhs: !fir.box<i32>, %rhs: !fir.box<i32>):
+  %l_addr = fir.box_addr %lhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %l_val = fir.load %l_addr : !fir.ref<i32>
+  %r_addr = fir.box_addr %rhs : (!fir.box<i32>) -> !fir.ref<i32>
+  %r_val = fir.load %r_addr : !fir.ref<i32>
+  %sum = arith.addi %l_val, %r_val : i32
+  %tmp = fir.alloca i32
+  fir.store %sum to %tmp : !fir.ref<i32>
+  %new = fir.embox %tmp : (!fir.ref<i32>) -> !fir.box<i32>
+  acc.yield %new : !fir.box<i32>
+}
+
+// CHECK-LABEL: acc.reduction.recipe @red_ref_box_no_destroy : !fir.ref<!fir.box<i32>> reduction_operator <add> init
+// CHECK: ^bb0(%[[IARGR2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[EMBOXR2:.*]] = fir.embox
+// CHECK:   %[[ALLOCAR2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[EMBOXR2]] to %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[ALLOCAR2]] : !fir.ref<!fir.box<i32>>
+// CHECK: } combiner {
+// CHECK: ^bb0(%[[LHS2:.*]]: !fir.ref<!fir.box<i32>>, %[[RHS2:.*]]: !fir.ref<!fir.box<i32>>)
+// CHECK:   %[[LLHS2:.*]] = fir.load %[[LHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LRHS2:.*]] = fir.load %[[RHS2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   %[[LADDR2:.*]] = fir.box_addr %[[LLHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[LVAL2:.*]] = fir.load %[[LADDR2]] : !fir.ref<i32>
+// CHECK:   %[[RADDR2:.*]] = fir.box_addr %[[LRHS2]] : (!fir.box<i32>) -> !fir.ref<i32>
+// CHECK:   %[[RVAL2:.*]] = fir.load %[[RADDR2]] : !fir.ref<i32>
+// CHECK:   %[[SUM2:.*]] = arith.addi %[[LVAL2]], %[[RVAL2]] : i32
+// CHECK:   %[[I32ALLOCA2:.*]] = fir.alloca i32
+// CHECK:   fir.store %[[SUM2]] to %[[I32ALLOCA2]] : !fir.ref<i32>
+// CHECK:   %[[NEWBOX2:.*]] = fir.embox %[[I32ALLOCA2]] : (!fir.ref<i32>) -> !fir.box<i32>
+// CHECK:   %[[BOXALLOCA2:.*]] = fir.alloca !fir.box<i32>
+// CHECK:   fir.store %[[NEWBOX2]] to %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+// CHECK:   acc.yield %[[BOXALLOCA2]] : !fir.ref<!fir.box<i32>>
+
+// -----
+
+// Comprehensive test that also verifies that uses of the recipes are updated.
+
+acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+^bb0(%arg0: !fir.ref<i32>):
+  %0 = fir.alloca i32
+  %1 = fir.declare %0 {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  acc.yield %1 : !fir.ref<i32>
+}
+acc.private.recipe @privatization_box_Uxf32 : !fir.box<!fir.array<?xf32>> init {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>):
+  %c0 = arith.constant 0 : index
+  %0:3 = fir.box_dims %arg0, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  %1 = fir.shape %0#1 : (index) -> !fir.shape<1>
+  %2 = fir.allocmem !fir.array<?xf32>, %0#1 {bindc_name = ".tmp", uniq_name = ""}
+  %3 = fir.declare %2(%1) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+  %4 = fir.embox %3(%1) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+  acc.yield %4 : !fir.box<!fir.array<?xf32>>
+} destroy {
+^bb0(%arg0: !fir.box<!fir.array<?xf32>>, %arg1: !fir.box<!fir.array<?xf32>>):
+  %0 = fir.box_addr %arg1 : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+  %1 = fir.convert %0 : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+  fir.freemem %1 : !fir.heap<!fir.array<?xf32>>
+  acc.terminator
+}
+func.func @_QPfoo(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+  %c200_i32 = arith.constant 200 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+  acc.parallel combined(loop) {
+    %4 = acc.private var(%3 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {name = "x"}
+    %5 = acc.private varPtr(%2 : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+    acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %4 : !fir.box<!fir.array<?xf32>>, @privatization_ref_i32 -> %5 : !fir.ref<i32>) control(%arg1 : i32) = (%c1_i32 : i32) to (%c200_i32 : i32)  step (%c1_i32 : i32) {
+      %6 = fir.dummy_scope : !fir.dscope
+      %7 = fir.declare %4 dummy_scope %6 {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+      %8 = fir.declare %5 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      %9 = fir.convert %arg1 : (i32) -> f32
+      %10 = fir.convert %arg1 : (i32) -> i64
+      %11 = fir.array_coor %7 %10 : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+      fir.store %9 to %11 : !fir.ref<f32>
+      acc.yield
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  return
+}
+
+// CHECK-LABEL:   acc.private.recipe @privatization_ref_i32 : !fir.ref<i32> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+// CHECK:           %[[VAL_1:.*]] = fir.alloca i32
+// CHECK:           %[[VAL_2:.*]] = fir.declare %[[VAL_1]] {uniq_name = "acc.private.init"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           acc.yield %[[VAL_2]] : !fir.ref<i32>
+// CHECK:         }
+
+// CHECK-LABEL:   acc.private.recipe @privatization_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> init {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]] = fir.allocmem !fir.array<?xf32>, %[[VAL_3]]#1 {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:           %[[VAL_6:.*]] = fir.declare %[[VAL_5]](%[[VAL_4]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]](%[[VAL_4]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.yield %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+
+// CHECK-LABEL:   } destroy {
+// CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<?xf32>>>):
+// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>>
+// CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.array<?xf32>>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:           fir.freemem %[[VAL_4]] : !fir.heap<!fir.array<?xf32>>
+// CHECK:           acc.terminator
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @_QPfoo(
+// CHECK-SAME:                      %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 200 : i32
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFfooEi"}
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_3]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+// CHECK:           fir.store %[[VAL_5]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:           acc.parallel combined(loop) {
+// CHECK:             %[[VAL_7:.*]] = acc.private varPtr(%[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.ref<!fir.box<!fir.array<?xf32>>> {name = "x"}
+// CHECK:             %[[VAL_8:.*]] = acc.private varPtr(%[[VAL_4]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "i"}
+// CHECK:             acc.loop combined(parallel) private(@privatization_box_Uxf32 -> %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, @privatization_ref_i32 -> %[[VAL_8]] : !fir.ref<i32>) control(%[[VAL_9:.*]] : i32) = (%[[VAL_1]] : i32) to (%[[VAL_0]] : i32)  step (%[[VAL_1]] : i32) {
+// CHECK:               %[[VAL_10:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+// CHECK:               %[[VAL_12:.*]] = fir.declare %[[VAL_11]] dummy_scope %[[VAL_10]] {uniq_name = "_QFfooEx"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+// CHECK:               %[[VAL_13:.*]] = fir.declare %[[VAL_8]] {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
+// CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+// CHECK:               %[[VAL_16:.*]] = fir.array_coor %[[VAL_12]] %[[VAL_15]] : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+// CHECK:               fir.store %[[VAL_14]] to %[[VAL_16]] : !fir.ref<f32>
+// CHECK:               acc.yield
+// CHECK:             } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+// CHECK:             acc.yield
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt
index 4ee9752727b87..c5bd4390a4b78 100644
--- a/flang/tools/fir-opt/CMakeLists.txt
+++ b/flang/tools/fir-opt/CMakeLists.txt
@@ -22,6 +22,7 @@ target_link_libraries(fir-opt PRIVATE
   HLFIRDialect
   HLFIRTransforms
   FIROpenACCSupport
+  FIROpenACCTransforms
   FIROpenMPSupport
   FlangOpenMPTransforms
   FIRAnalysis
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index d66fc3f08bdf8..b0b277b88dbe2 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenACC/Passes.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/Support/InitFIR.h"
 #include "flang/Optimizer/Transforms/Passes.h"
@@ -37,6 +38,7 @@ int main(int argc, char **argv) {
   fir::registerOptTransformPasses();
   hlfir::registerHLFIRPasses();
   flangomp::registerFlangOpenMPPasses();
+  fir::acc::registerFIROpenACCPasses();
 #ifdef FLANG_INCLUDE_TESTS
   fir::test::registerTestFIRAliasAnalysisPass();
   fir::test::registerTestFIROpenACCInterfacesPass();
diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index ffb4fe6487fdc..796b1d8ed1398 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -36,7 +36,7 @@
   },
   "math": {
     "LIBC_CONF_MATH_OPTIMIZATIONS": {
-      "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT)"
+      "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES)"
     }
   },
   "general": {
diff --git a/libcxx/include/__cxx03/__bit_reference b/libcxx/include/__cxx03/__bit_reference
index 76027e2d1523f..ac0005ff00f13 100644
--- a/libcxx/include/__cxx03/__bit_reference
+++ b/libcxx/include/__cxx03/__bit_reference
@@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
       unsigned __clz       = __bits_per_word - __first.__ctz_;
       difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
       __n -= __dn;
-      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __m = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz - __dn));
       __storage_type __b = *__first.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -185,7 +185,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
     // do last word
     if (__n > 0) {
       __first.__seg_ += __nw;
-      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __m = __storage_type(~0) >> (__bits_per_word - __n);
       __storage_type __b = *__first.__seg_ & __m;
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b;
@@ -210,11 +210,11 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
       unsigned __clz_f     = __bits_per_word - __first.__ctz_;
       difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
       __n -= __dn;
-      __storage_type __m   = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+      __storage_type __m   = (__storage_type(~0) << __first.__ctz_) & (__storage_type(~0) >> (__clz_f - __dn));
       __storage_type __b   = *__first.__seg_ & __m;
       unsigned __clz_r     = __bits_per_word - __result.__ctz_;
       __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
-      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+      __m                  = (__storage_type(~0) << __result.__ctz_) & (__storage_type(~0) >> (__clz_r - __ddn));
       *__result.__seg_ &= ~__m;
       if (__result.__ctz_ > __first.__ctz_)
         *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
@@ -224,7 +224,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
       __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
       __dn -= __ddn;
       if (__dn > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __dn);
+        __m = __storage_type(~0) >> (__bits_per_word - __dn);
         *__result.__seg_ &= ~__m;
         *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
         __result.__ctz_ = static_cast<unsigned>(__dn);
@@ -235,7 +235,7 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
     // __first.__ctz_ == 0;
     // do middle words
     unsigned __clz_r   = __bits_per_word - __result.__ctz_;
-    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+    __storage_type __m = __storage_type(~0) << __result.__ctz_;
     for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
       __storage_type __b = *__first.__seg_;
       *__result.__seg_ &= ~__m;
@@ -246,17 +246,17 @@ _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
     }
     // do last word
     if (__n > 0) {
-      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
+      __m                 = __storage_type(~0) >> (__bits_per_word - __n);
       __storage_type __b  = *__first.__seg_ & __m;
       __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
-      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+      __m                 = (__storage_type(~0) << __result.__ctz_) & (__storage_type(~0) >> (__clz_r - __dn));
       *__result.__seg_ &= ~__m;
       *__result.__seg_ |= __b << __result.__ctz_;
       __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
       __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
       __n -= __dn;
       if (__n > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __n);
+        __m = __storage_type(~0) >> (__bits_per_word - __n);
         *__result.__seg_ &= ~__m;
         *__result.__seg_ |= __b >> __dn;
         __result.__ctz_ = static_cast<unsigned>(__n);
diff --git a/libcxx/include/__cxx03/__verbose_abort b/libcxx/include/__cxx03/__verbose_abort
index 4fcfffa2b4dfa..52d1297b65593 100644
--- a/libcxx/include/__cxx03/__verbose_abort
+++ b/libcxx/include/__cxx03/__verbose_abort
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // This function should never be called directly from the code -- it should only be called through
 // the _LIBCPP_VERBOSE_ABORT macro.
 _LIBCPP_NORETURN _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS
-_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...);
+_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) _NOEXCEPT;
 
 // _LIBCPP_VERBOSE_ABORT(format, args...)
 //
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 74923ddb74e9c..6b65e738fef3b 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -808,7 +808,7 @@ public:
           }
           {
             __node_holder __h = __construct_node_hash(__hash, std::forward<_Args>(__args2)...);
-            if (size() + 1 > __bc * max_load_factor() || __bc == 0) {
+            if (size() + 1 > __bc * max_load_factor()) {
               __rehash_unique(std::max<size_type>(2 * __bc + !std::__is_hash_power2(__bc),
                                                   size_type(__math::ceil(float(size() + 1) / max_load_factor()))));
               __bc    = bucket_count();
diff --git a/libcxx/include/__utility/cmp.h b/libcxx/include/__utility/cmp.h
index 14dc0c154c040..68864e23e0397 100644
--- a/libcxx/include/__utility/cmp.h
+++ b/libcxx/include/__utility/cmp.h
@@ -26,10 +26,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
+template <typename _Tp, typename _Ip>
+concept __comparison_can_promote_to =
+    sizeof(_Tp) < sizeof(_Ip) || (sizeof(_Tp) == sizeof(_Ip) && __signed_integer<_Tp>);
+
 template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_equal(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t == __u;
+  else if constexpr (__comparison_can_promote_to<_Tp, int> && __comparison_can_promote_to<_Up, int>)
+    return static_cast<int>(__t) == static_cast<int>(__u);
+  else if constexpr (__comparison_can_promote_to<_Tp, long long> && __comparison_can_promote_to<_Up, long long>)
+    return static_cast<long long>(__t) == static_cast<long long>(__u);
   else if constexpr (is_signed_v<_Tp>)
     return __t < 0 ? false : make_unsigned_t<_Tp>(__t) == __u;
   else
@@ -45,6 +53,10 @@ template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t < __u;
+  else if constexpr (__comparison_can_promote_to<_Tp, int> && __comparison_can_promote_to<_Up, int>)
+    return static_cast<int>(__t) < static_cast<int>(__u);
+  else if constexpr (__comparison_can_promote_to<_Tp, long long> && __comparison_can_promote_to<_Up, long long>)
+    return static_cast<long long>(__t) < static_cast<long long>(__u);
   else if constexpr (is_signed_v<_Tp>)
     return __t < 0 ? true : make_unsigned_t<_Tp>(__t) < __u;
   else
diff --git a/libcxx/test/benchmarks/utility/cmp.bench.cpp b/libcxx/test/benchmarks/utility/cmp.bench.cpp
new file mode 100644
index 0000000000000..1ed179ac4e38c
--- /dev/null
+++ b/libcxx/test/benchmarks/utility/cmp.bench.cpp
@@ -0,0 +1,139 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <utility>
+#include "../CartesianBenchmarks.h"
+#include "benchmark/benchmark.h"
+
+namespace {
+
+enum ValueType : size_t {
+  SChar,
+  UChar,
+  Short,
+  UShort,
+  Int,
+  UInt,
+  Long,
+  ULong,
+  LongLong,
+  ULongLong,
+#ifndef TEST_HAS_NO_INT128
+  Int128,
+  UInt128,
+#endif
+};
+
+struct AllValueTypes : EnumValuesAsTuple<AllValueTypes, ValueType, 6> {
+  static constexpr const char* Names[] = {
+      "schar",
+      "uchar",
+      "short",
+      "ushort",
+      "int",
+      "uint",
+      "long",
+      "ulong",
+      "longlong",
+      "ulonglong",
+#ifndef TEST_HAS_NO_INT128
+      "int128",
+      "uint128"
+#endif
+  };
+};
+
+using TestType =
+    std::tuple< signed char,
+                unsigned char,
+                short,
+                unsigned short,
+                int,
+                unsigned int,
+                long,
+                unsigned long,
+                long long,
+                unsigned long long
+#ifndef TEST_HAS_NO_INT128
+                ,
+                __int128_t,
+                __uint128_t
+#endif
+                >;
+
+template <typename TType, typename UType>
+struct CmpEqual {
+  static void run(benchmark::State& state) {
+    using T = std::tuple_element_t<TType::value, TestType>;
+    using U = std::tuple_element_t<UType::value, TestType>;
+
+    T x1 = T{127}, x2 = T{111};
+    U y1 = U{123}, y2 = U{1};
+    for (auto _ : state) {
+      benchmark::DoNotOptimize(x1);
+      benchmark::DoNotOptimize(x2);
+      benchmark::DoNotOptimize(y1);
+      benchmark::DoNotOptimize(y2);
+      benchmark::DoNotOptimize(std::cmp_equal(x1, y1));
+      benchmark::DoNotOptimize(std::cmp_equal(y1, x1));
+      benchmark::DoNotOptimize(std::cmp_equal(x1, x1));
+      benchmark::DoNotOptimize(std::cmp_equal(y1, y1));
+
+      benchmark::DoNotOptimize(std::cmp_equal(x2, y2));
+      benchmark::DoNotOptimize(std::cmp_equal(y2, x2));
+      benchmark::DoNotOptimize(std::cmp_equal(x2, x2));
+      benchmark::DoNotOptimize(std::cmp_equal(y2, y2));
+    }
+  }
+
+  static std::string name() { return "BM_CmpEqual" + TType::name() + UType::name(); }
+};
+
+template <typename TType, typename UType>
+struct CmpLess {
+  static void run(benchmark::State& state) {
+    using T = std::tuple_element_t<TType::value, TestType>;
+    using U = std::tuple_element_t<UType::value, TestType>;
+
+    T x1 = T{127}, x2 = T{111};
+    U y1 = U{123}, y2 = U{1};
+    for (auto _ : state) {
+      benchmark::DoNotOptimize(x1);
+      benchmark::DoNotOptimize(x2);
+      benchmark::DoNotOptimize(y1);
+      benchmark::DoNotOptimize(y2);
+      benchmark::DoNotOptimize(std::cmp_less(x1, y1));
+      benchmark::DoNotOptimize(std::cmp_less(y1, x1));
+      benchmark::DoNotOptimize(std::cmp_less(x1, x1));
+      benchmark::DoNotOptimize(std::cmp_less(y1, y1));
+
+      benchmark::DoNotOptimize(std::cmp_less(x2, y2));
+      benchmark::DoNotOptimize(std::cmp_less(y2, x2));
+      benchmark::DoNotOptimize(std::cmp_less(x2, x2));
+      benchmark::DoNotOptimize(std::cmp_less(y2, y2));
+    }
+  }
+
+  static std::string name() { return "BM_CmpLess" + TType::name() + UType::name(); }
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+
+  makeCartesianProductBenchmark<CmpEqual, AllValueTypes, AllValueTypes>();
+  makeCartesianProductBenchmark<CmpLess, AllValueTypes, AllValueTypes>();
+  benchmark::RunSpecifiedBenchmarks();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
index 390c6b6db190d..3c7a2d45d4f01 100644
--- a/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
+++ b/libcxx/test/libcxx-03/assertions/customize_verbose_abort.link-time.pass.cpp
@@ -12,9 +12,7 @@
 // failures when back-deploying.
 // XFAIL: availability-verbose_abort-missing
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
-#include <__verbose_abort>
+#include <__cxx03/__verbose_abort>
 #include <cstdlib>
 
 void std::__libcpp_verbose_abort(char const*, ...) _NOEXCEPT { std::exit(EXIT_SUCCESS); }
diff --git a/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index 7ead65caf9fda..a9fe04fb0bcd5 100644
--- a/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx-03/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -21,8 +21,6 @@
 // GCC doesn't support the aligned-allocation flags.
 // XFAIL: gcc
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 // RUN: %{build} -faligned-allocation -fsized-deallocation
 // RUN: %{run}
 // RUN: %{build} -faligned-allocation -fno-sized-deallocation -DNO_SIZE
@@ -40,7 +38,7 @@
 
 TEST_DIAGNOSTIC_PUSH
 TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header")
-#include <__memory/aligned_alloc.h>
+#include <__cxx03/__memory/aligned_alloc.h>
 TEST_DIAGNOSTIC_POP
 
 struct alloc_stats {
@@ -138,42 +136,42 @@ void test_libcpp_dealloc() {
   std::size_t with_size_val   = 2;
 
   {
-    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), under_align_val);
+    std::__libcpp_deallocate_unsized(p, under_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 
 #if defined(NO_SIZE) && defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_plain());
   }
   stats.reset();
 #elif defined(NO_SIZE)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
 #elif defined(NO_ALIGN)
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
 #else
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), over_align_val);
+    std::__libcpp_deallocate(p, with_size_val, over_align_val);
     assert(stats.expect_size_align(with_size_val, over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate_unsized<char>(static_cast<char*>(p), over_align_val);
+    std::__libcpp_deallocate_unsized(p, over_align_val);
     assert(stats.expect_align(over_align_val));
   }
   stats.reset();
   {
-    std::__libcpp_deallocate<char>(static_cast<char*>(p), std::__element_count(with_size_val), under_align_val);
+    std::__libcpp_deallocate(p, with_size_val, under_align_val);
     assert(stats.expect_size(with_size_val));
   }
   stats.reset();
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
index 3c9cf9bd61003..bede567f33019 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
@@ -12,8 +12,6 @@
 //   constexpr OutIter   // constexpr after C++17
 //   copy(InIter first, InIter last, OutIter result);
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <algorithm>
 #include <cassert>
 #include <vector>
diff --git a/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp b/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
index 69296dfa50121..d289ef63dfec1 100644
--- a/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.runtime/cstdalign.compile.pass.cpp
@@ -10,7 +10,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+// UNSUPPORTED: c++03
 
 #include <cstdalign>
 
diff --git a/lld/utils/benchmark.py b/lld/utils/benchmark.py
index 7202e07ec438d..059f82ac78d0c 100755
--- a/lld/utils/benchmark.py
+++ b/lld/utils/benchmark.py
@@ -13,13 +13,8 @@
 import json
 import datetime
 import argparse
-
-try:
-    from urllib.parse import urlencode
-    from urllib.request import urlopen, Request
-except ImportError:
-    from urllib import urlencode
-    from urllib2 import urlopen, Request
+from urllib.parse import urlencode
+from urllib.request import urlopen, Request
 
 
 parser = argparse.ArgumentParser()
diff --git a/lldb/examples/python/performance.py b/lldb/examples/python/performance.py
index b86b5a52522e0..c3181b61c84f7 100755
--- a/lldb/examples/python/performance.py
+++ b/lldb/examples/python/performance.py
@@ -16,7 +16,6 @@
 import sys
 import subprocess
 import time
-import types
 
 # ----------------------------------------------------------------------
 # Code that auto imports LLDB
@@ -121,19 +120,19 @@ def __init__(
             self.breakpoints.append(breakpoint)
         else:
             if module:
-                if isinstance(module, types.ListType):
+                if isinstance(module, list):
                     for module_path in module:
                         self.modules.Append(lldb.SBFileSpec(module_path, False))
-                elif isinstance(module, types.StringTypes):
+                elif isinstance(module, str):
                     self.modules.Append(lldb.SBFileSpec(module, False))
             if name:
                 # "file" can be a list or a string
                 if file:
-                    if isinstance(file, types.ListType):
+                    if isinstance(file, list):
                         self.files = lldb.SBFileSpecList()
                         for f in file:
                             self.files.Append(lldb.SBFileSpec(f, False))
-                    elif isinstance(file, types.StringTypes):
+                    elif isinstance(file, str):
                         self.files.Append(lldb.SBFileSpec(file, False))
                 self.breakpoints.append(
                     self.target.BreakpointCreateByName(name, self.modules, self.files)
diff --git a/lldb/examples/summaries/cocoa/CFString.py b/lldb/examples/summaries/cocoa/CFString.py
index 74bd927e9db21..02b670651cd53 100644
--- a/lldb/examples/summaries/cocoa/CFString.py
+++ b/lldb/examples/summaries/cocoa/CFString.py
@@ -11,11 +11,6 @@
 import lldb.runtime.objc.objc_runtime
 import lldb.formatters.Logger
 
-try:
-    unichr
-except NameError:
-    unichr = chr
-
 
 def CFString_SummaryProvider(valobj, dict):
     logger = lldb.formatters.Logger.Logger()
@@ -107,7 +102,7 @@ def read_unicode(self, pointer, max_len=2048):
                 value = b1 * 256 + b0
             else:
                 value = b0 * 256 + b1
-            pystr = pystr + unichr(value)
+            pystr = pystr + chr(value)
             # read max_len unicode values, not max_len bytes
             max_len = max_len - 1
         return pystr
diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
index ed028a1a4ea3f..4aea8009058b9 100644
--- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py
@@ -11,6 +11,9 @@ class TestCase(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
     @skipIf(macos_version=["<", "15.0"])
+    @skipIf(
+        bugnumber="ASTImport of lambdas not supported: https://github.com/llvm/llvm-project/issues/149477"
+    )
     def test(self):
         self.build()
 
diff --git a/lldb/utils/lui/lldbutil.py b/lldb/utils/lui/lldbutil.py
index 140317af3670b..589acaeea3206 100644
--- a/lldb/utils/lui/lldbutil.py
+++ b/lldb/utils/lui/lldbutil.py
@@ -951,7 +951,7 @@ def get_GPRs(frame):
         from lldbutil import get_GPRs
         regs = get_GPRs(frame)
         for reg in regs:
-            print "%s => %s" % (reg.GetName(), reg.GetValue())
+            print("%s => %s" % (reg.GetName(), reg.GetValue()))
         ...
     """
     return get_registers(frame, "general purpose")
@@ -965,7 +965,7 @@ def get_FPRs(frame):
         from lldbutil import get_FPRs
         regs = get_FPRs(frame)
         for reg in regs:
-            print "%s => %s" % (reg.GetName(), reg.GetValue())
+            print("%s => %s" % (reg.GetName(), reg.GetValue()))
         ...
     """
     return get_registers(frame, "floating point")
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 402fd05b9e696..8193adcd97a19 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -488,21 +488,21 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
 
      **GCN GFX11 (RDNA 3.5)** [AMD-GCN-GFX11-RDNA3.5]_
      -----------------------------------------------------------------------------------------------------------------------
-     ``gfx1150``                 ``amdgcn``   APU   - cumode          - Architected                   *TBA*
+     ``gfx1150``                 ``amdgcn``   APU   - cumode          - Architected                   Radeon 890M
                                                     - wavefrontsize64   flat
                                                                         scratch                       .. TODO::
                                                                       - Packed
                                                                         work-item                       Add product
                                                                         IDs                             names.
 
-     ``gfx1151``                 ``amdgcn``   APU   - cumode          - Architected                   *TBA*
+     ``gfx1151``                 ``amdgcn``   APU   - cumode          - Architected                   Radeon 8060S
                                                     - wavefrontsize64   flat
                                                                         scratch                       .. TODO::
                                                                       - Packed
                                                                         work-item                       Add product
                                                                         IDs                             names.
 
-     ``gfx1152``                 ``amdgcn``   APU   - cumode          - Architected                   *TBA*
+     ``gfx1152``                 ``amdgcn``   APU   - cumode          - Architected                   Radeon 860M
                                                     - wavefrontsize64   flat
                                                                         scratch                       .. TODO::
                                                                       - Packed
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
new file mode 100644
index 0000000000000..644c4f614108e
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h
@@ -0,0 +1,87 @@
+//===- SimpleRemoteMemoryMapper.h - Remote memory mapper --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple memory mapper that uses EPC calls to implement reserve, initialize,
+// deinitialize, and release.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+#define LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
+
+#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+
+namespace llvm::orc {
+
+/// Manages remote memory by making SPS-based EPC calls.
+class LLVM_ABI SimpleRemoteMemoryMapper final : public MemoryMapper {
+public:
+  struct SymbolAddrs {
+    ExecutorAddr Instance;
+    ExecutorAddr Reserve;
+    ExecutorAddr Initialize;
+    ExecutorAddr Deinitialize;
+    ExecutorAddr Release;
+  };
+
+  SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC, SymbolAddrs SAs);
+
+  static Expected<std::unique_ptr<SimpleRemoteMemoryMapper>>
+  Create(ExecutorProcessControl &EPC, SymbolAddrs SAs) {
+    return std::make_unique<SimpleRemoteMemoryMapper>(EPC, SAs);
+  }
+
+  unsigned int getPageSize() override { return EPC.getPageSize(); }
+
+  /// Reserves memory in the remote process by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSExpected<SPSExecutorAddr>(uint64_t Size).
+  ///
+  /// On success, returns the base address of the reserved range.
+  void reserve(size_t NumBytes, OnReservedFunction OnReserved) override;
+
+  char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+                size_t ContentSize) override;
+
+  /// Initializes memory within a previously reserved region (applying
+  /// protections and running any finalization actions) by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSExpected<SPSExecutorAddr>(SPSFinalizeRequest)
+  ///
+  /// On success, returns a key that can be used to deinitialize the region.
+  void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
+
+  /// Given a series of keys from previous initialize calls, deinitialize
+  /// previously initialized memory regions (running dealloc actions, resetting
+  /// permissions and decommitting if possible) by calling a remote
+  /// SPS-wrapper-function with signature
+  ///
+  ///   SPSError(SPSSequence<SPSExecutorAddr> Keys)
+  ///
+  void deinitialize(ArrayRef<ExecutorAddr> Allocations,
+                    OnDeinitializedFunction OnDeInitialized) override;
+
+  /// Given a sequence of base addresses from previous reserve calls, release
+  /// the underlying ranges (deinitializing any remaining regions within them)
+  /// by calling a remote SPS-wrapper-function with signature
+  ///
+  ///   SPSError(SPSSequence<SPSExecutorAddr> Bases)
+  ///
+  void release(ArrayRef<ExecutorAddr> Reservations,
+               OnReleasedFunction OnRelease) override;
+
+private:
+  ExecutorProcessControl &EPC;
+  SymbolAddrs SAs;
+};
+
+} // namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_SIMPLEREMOTEMEMORYMAPPER_H
diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index 0ffe3ae37da28..f34392538a7cb 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_component_library(LLVMOrcJIT
   SectCreate.cpp
   SelfExecutorProcessControl.cpp
   SimpleRemoteEPC.cpp
+  SimpleRemoteMemoryMapper.cpp
   Speculation.cpp
   SpeculateAnalyses.cpp
   ExecutorProcessControl.cpp
diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
new file mode 100644
index 0000000000000..b82de3fd15216
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.cpp
@@ -0,0 +1,104 @@
+//===---- SimpleRemoteMemoryMapper.cpp - Remote memory mapper ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+
+namespace llvm::orc {
+
+SimpleRemoteMemoryMapper::SimpleRemoteMemoryMapper(ExecutorProcessControl &EPC,
+                                                   SymbolAddrs SAs)
+    : EPC(EPC), SAs(SAs) {}
+
+void SimpleRemoteMemoryMapper::reserve(size_t NumBytes,
+                                       OnReservedFunction OnReserved) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReserveSignature>(
+      SAs.Reserve,
+      [NumBytes, OnReserved = std::move(OnReserved)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          cantFail(Result.takeError());
+          return OnReserved(std::move(SerializationErr));
+        }
+
+        if (Result)
+          OnReserved(ExecutorAddrRange(*Result, NumBytes));
+        else
+          OnReserved(Result.takeError());
+      },
+      SAs.Instance, static_cast<uint64_t>(NumBytes));
+}
+
+char *SimpleRemoteMemoryMapper::prepare(jitlink::LinkGraph &G,
+                                        ExecutorAddr Addr, size_t ContentSize) {
+  return G.allocateBuffer(ContentSize).data();
+}
+
+void SimpleRemoteMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
+                                          OnInitializedFunction OnInitialized) {
+  // Build the request: take over AI's actions and describe its segments.
+  tpctypes::FinalizeRequest FR;
+  std::swap(FR.Actions, AI.Actions);
+  FR.Segments.reserve(AI.Segments.size());
+
+  // Bind by reference: the loop only reads Seg, so no copy is needed.
+  for (const auto &Seg : AI.Segments)
+    FR.Segments.push_back({Seg.AG, AI.MappingBase + Seg.Offset,
+                           Seg.ContentSize + Seg.ZeroFillSize,
+                           ArrayRef<char>(Seg.WorkingMem, Seg.ContentSize)});
+
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapInitializeSignature>(
+      SAs.Initialize,
+      [OnInitialized = std::move(OnInitialized)](
+          Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+        if (SerializationErr) {
+          // Consume Result before reporting the serialization failure.
+          cantFail(Result.takeError());
+          return OnInitialized(std::move(SerializationErr));
+        }
+        OnInitialized(std::move(Result));
+      },
+      SAs.Instance, std::move(FR));
+}
+
+void SimpleRemoteMemoryMapper::deinitialize(
+    ArrayRef<ExecutorAddr> Allocations,
+    MemoryMapper::OnDeinitializedFunction OnDeinitialized) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapDeinitializeSignature>(
+      SAs.Deinitialize,
+      [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr,
+                                                     Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnDeinitialized(std::move(SerializationErr));
+        }
+
+        OnDeinitialized(std::move(Result));
+      },
+      SAs.Instance, Allocations);
+}
+
+void SimpleRemoteMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
+                                       OnReleasedFunction OnReleased) {
+  EPC.callSPSWrapperAsync<rt::SPSSimpleRemoteMemoryMapReleaseSignature>(
+      SAs.Release,
+      [OnReleased = std::move(OnReleased)](Error SerializationErr,
+                                           Error Result) mutable {
+        if (SerializationErr) {
+          cantFail(std::move(Result));
+          return OnReleased(std::move(SerializationErr));
+        }
+
+        return OnReleased(std::move(Result));
+      },
+      SAs.Instance, Bases);
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index be2f2e4cbbdb1..91c1f59c1d066 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1561,6 +1561,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -1717,6 +1718,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -7775,6 +7777,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMINIMUM:
     return LowerVECREDUCE(Op, DAG);
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_FMUL:
+    return LowerVECREDUCE_MUL(Op, DAG);
   case ISD::ATOMIC_LOAD_AND:
     return LowerATOMIC_LOAD_AND(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
@@ -16794,6 +16799,33 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
   }
 }
 
+SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  assert(SrcVT.isScalableVector() && "Unexpected operand type!");
+
+  SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
+  unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
+  SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
+
+  // Whilst we don't know the size of the vector, we do know its maximum size,
+  // so we can perform a tree reduction with an identity vector. Once the
+  // result has been reached, any remaining stages (when the vector is smaller
+  // than the maximum) have no effect.
+
+  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
+
+  for (unsigned I = 0; I < Stages; ++I) {
+    Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
+    Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
+  }
+
+  return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
+}
+
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                     SelectionDAG &DAG) const {
   auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fdc8e48e..9495c9ffc47aa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -752,6 +752,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE_MUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index be6239590e7da..e0375ea790566 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -513,8 +513,7 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI,
     }
 
     if (Imm == AMDGPU::EncValues::LITERAL_CONST) {
-      Op = decodeLiteralConstant(
-          Desc, OpDesc, OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64);
+      Op = decodeLiteralConstant(Desc, OpDesc);
       continue;
     }
 
@@ -1545,21 +1544,21 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
 MCOperand
 AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const {
   if (HasLiteral) {
-    if (Literal64 != Val)
+    if (Literal != Val)
       return errOperand(Val, "More than one unique literal is illegal");
   }
   HasLiteral = true;
-  Literal = Literal64 = Val;
+  Literal = Val;
 
-  bool UseLit64 = Hi_32(Literal64) == 0;
+  bool UseLit64 = Hi_32(Literal) == 0;
   return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Literal64, getContext()))
-                  : MCOperand::createImm(Literal64);
+                        LitModifier::Lit64, Literal, getContext()))
+                  : MCOperand::createImm(Literal);
 }
 
-MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
-                                                    const MCOperandInfo &OpDesc,
-                                                    bool ExtendFP64) const {
+MCOperand
+AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
+                                          const MCOperandInfo &OpDesc) const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
@@ -1569,35 +1568,79 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc,
                         Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal = Literal64 = eatBytes<uint32_t>(Bytes);
-    if (ExtendFP64)
-      Literal64 <<= 32;
+    Literal = eatBytes<uint32_t>(Bytes);
   }
 
-  int64_t Val = ExtendFP64 ? Literal64 : Literal;
+  // When disassembling, always assume all inline constants are available.
+  bool HasInv2Pi = true;
 
-  bool CanUse64BitLiterals =
-      STI.hasFeature(AMDGPU::Feature64BitLiterals) &&
-      !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P));
-
-  bool UseLit64 = false;
-  if (CanUse64BitLiterals) {
-    if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
-        OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64)
-      UseLit64 = false;
-    else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
-             OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
-             OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64)
-      UseLit64 = Hi_32(Literal64) == 0;
+  // Invalid instruction codes may contain literals for inline-only
+  // operands, so we support them here as well.
+  int64_t Val = Literal;
+  bool UseLit = false;
+  switch (OpDesc.OperandType) {
+  default:
+    llvm_unreachable("Unexpected operand type!");
+  case AMDGPU::OPERAND_REG_IMM_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+    UseLit = AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
+    UseLit = AMDGPU::isInlinableLiteralV2BF16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    UseLit = AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2FP16:
+    UseLit = AMDGPU::isInlinableLiteralV2F16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16:
+    break;
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    UseLit = AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_V2INT16:
+    UseLit = AMDGPU::isInlinableLiteralV2I16(Val);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+  case AMDGPU::OPERAND_REG_IMM_V2FP32:
+  case AMDGPU::OPERAND_REG_IMM_V2INT32:
+  case AMDGPU::OPERAND_KIMM32:
+    UseLit = AMDGPU::isInlinableLiteral32(Val, HasInv2Pi);
+    break;
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+    Val <<= 32;
+    break;
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    UseLit = AMDGPU::isInlinableLiteral64(Val, HasInv2Pi);
+    break;
+  case MCOI::OPERAND_REGISTER:
+    // TODO: Disassembling V_DUAL_FMAMK_F32_X_FMAMK_F32_gfx11 ends up
+    // decoding a literal in the position of a register operand. Give
+    // it special handling in the caller, decodeImmOperands(), instead
+    // of quietly allowing it here.
+    break;
   }
 
-  return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Val, getContext()))
-                  : MCOperand::createImm(Val);
+  return UseLit ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
+                      LitModifier::Lit, Val, getContext()))
+                : MCOperand::createImm(Val);
 }
 
-MCOperand
-AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
+MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const {
   assert(STI.hasFeature(AMDGPU::Feature64BitLiterals));
 
   if (!HasLiteral) {
@@ -1606,25 +1649,13 @@ AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const {
                                Twine(Bytes.size()));
     }
     HasLiteral = true;
-    Literal64 = eatBytes<uint64_t>(Bytes);
-  }
-
-  bool UseLit64 = false;
-  const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
-  const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()];
-  if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
-      OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) {
-    UseLit64 = false;
-  } else {
-    assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 ||
-           OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 ||
-           OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64);
-    UseLit64 = Hi_32(Literal64) == 0;
+    Literal = eatBytes<uint64_t>(Bytes);
   }
 
+  bool UseLit64 = Hi_32(Literal) == 0;
   return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit(
-                        LitModifier::Lit64, Literal64, getContext()))
-                  : MCOperand::createImm(Literal64);
+                        LitModifier::Lit64, Literal, getContext()))
+                  : MCOperand::createImm(Literal);
 }
 
 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1913,7 +1944,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst,
     return MCOperand::createImm(Val);
 
   if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) {
-    return decodeLiteral64Constant(Inst);
+    return decodeLiteral64Constant();
   }
 
   switch (Width) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 27518577c3ebb..d103d79fdabb9 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -44,8 +44,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   const unsigned HwModeRegClass;
   const unsigned TargetMaxInstBytes;
   mutable ArrayRef<uint8_t> Bytes;
-  mutable uint32_t Literal;
-  mutable uint64_t Literal64;
+  mutable uint64_t Literal;
   mutable bool HasLiteral;
   mutable std::optional<bool> EnableWavefrontSize32;
   unsigned CodeObjectVersion;
@@ -144,9 +143,8 @@ class AMDGPUDisassembler : public MCDisassembler {
   MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
   MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const;
   MCOperand decodeLiteralConstant(const MCInstrDesc &Desc,
-                                  const MCOperandInfo &OpDesc,
-                                  bool ExtendFP64) const;
-  MCOperand decodeLiteral64Constant(const MCInst &Inst) const;
+                                  const MCOperandInfo &OpDesc) const;
+  MCOperand decodeLiteral64Constant() const;
 
   MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const;
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 0f6e1cad5a1dd..eedfdb309d289 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1771,6 +1771,10 @@ defm RELAXED_DOT_ADD :
             "i32x4.relaxed_dot_i8x16_i7x16_add_s\t$dst, $lhs, $rhs, $acc",
             "i32x4.relaxed_dot_i8x16_i7x16_add_s", 0x113>;
 
+def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs),
+                                                         (v16i8 V128:$rhs))),
+          (RELAXED_DOT_ADD $lhs, $rhs, $acc)>, Requires<[HasRelaxedSIMD]>;
+
 //===----------------------------------------------------------------------===//
 // Relaxed BFloat16 dot product
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 481a9be8374ab..1fca466fdc54d 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1946,16 +1946,14 @@ static void addConstantComments(const MachineInstr *MI,
     CASE_ARITH_RM(PMADDUBSW) {
       unsigned SrcIdx = getSrcIdx(MI, 1);
       if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
-        if (C->getType()->getScalarSizeInBits() == 8) {
-          std::string Comment;
-          raw_string_ostream CS(Comment);
-          unsigned VectorWidth =
-              X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
-          CS << "[";
-          printConstant(C, VectorWidth, CS);
-          CS << "]";
-          OutStreamer.AddComment(CS.str());
-        }
+        std::string Comment;
+        raw_string_ostream CS(Comment);
+        unsigned VectorWidth =
+            X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+        CS << "[";
+        printConstant(C, VectorWidth, CS);
+        CS << "]";
+        OutStreamer.AddComment(CS.str());
       }
       break;
     }
@@ -1967,16 +1965,14 @@ static void addConstantComments(const MachineInstr *MI,
     CASE_ARITH_RM(PMULHRSW) {
       unsigned SrcIdx = getSrcIdx(MI, 1);
       if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) {
-        if (C->getType()->getScalarSizeInBits() == 16) {
-          std::string Comment;
-          raw_string_ostream CS(Comment);
-          unsigned VectorWidth =
-              X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
-          CS << "[";
-          printConstant(C, VectorWidth, CS);
-          CS << "]";
-          OutStreamer.AddComment(CS.str());
-        }
+        std::string Comment;
+        raw_string_ostream CS(Comment);
+        unsigned VectorWidth =
+            X86::getVectorRegisterWidth(MI->getDesc().operands()[0]);
+        CS << "[";
+        printConstant(C, VectorWidth, CS);
+        CS << "]";
+        OutStreamer.AddComment(CS.str());
       }
       break;
     }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4c9b10a094981..cdc559b489e9d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -156,9 +156,9 @@ Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
   Type *Ty = CI.getType();
 
-  if (auto *SrcC = dyn_cast<Constant>(Src))
-    if (Constant *Res = ConstantFoldCastOperand(CI.getOpcode(), SrcC, Ty, DL))
-      return replaceInstUsesWith(CI, Res);
+  if (Value *Res =
+          simplifyCastInst(CI.getOpcode(), Src, Ty, SQ.getWithInstruction(&CI)))
+    return replaceInstUsesWith(CI, Res);
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8d76b2d827754..7f5a41c311fc4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2122,9 +2122,13 @@ static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
 
   // Return true if we do not know how to (mechanically) hoist a given recipe
-  // out of a loop region. Does not address legality concerns such as aliasing
-  // or speculation safety.
+  // out of a loop region.
   auto CannotHoistRecipe = [](VPRecipeBase &R) {
+    // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+    // their memory location is not modified in the vector loop.
+    if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
+      return true;
+
     // Allocas cannot be hoisted.
     auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
     return RepR && RepR->getOpcode() == Instruction::Alloca;
@@ -2132,17 +2136,18 @@ static void licm(VPlan &Plan) {
 
   // Hoist any loop invariant recipes from the vector loop region to the
   // preheader. Preform a shallow traversal of the vector loop region, to
-  // exclude recipes in replicate regions.
+  // exclude recipes in replicate regions. Since the top-level blocks in the
+  // vector loop region are guaranteed to execute if the vector pre-header is,
+  // we don't need to check speculation safety.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  assert(Preheader->getSingleSuccessor() == LoopRegion &&
+         "Expected vector preheader's successor to be the vector loop region");
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       if (CannotHoistRecipe(R))
         continue;
-      // TODO: Relax checks in the future, e.g. we could also hoist reads, if
-      // their memory location is not modified in the vector loop.
-      if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
-          any_of(R.operands(), [](VPValue *Op) {
+      if (any_of(R.operands(), [](VPValue *Op) {
             return !Op->isDefinedOutsideLoopRegions();
           }))
         continue;
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 15ee6a02a1639..36655f6b781b9 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
   ret float %r
 }
 
+; No FMULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define half @fmulv_nxv2f16(half %init, <vscale x 2 x half> %a) {
+; CHECK-LABEL: fmulv_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv2f16(half %init, <vscale x 2 x half> %a)
+  ret half %res
+}
+
+define half @fmulv_nxv4f16(half %init, <vscale x 4 x half> %a) {
+; CHECK-LABEL: fmulv_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv4f16(half %init, <vscale x 4 x half> %a)
+  ret half %res
+}
+
+define half @fmulv_nxv8f16(half %init, <vscale x 8 x half> %a) {
+; CHECK-LABEL: fmulv_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.h, #1.00000000
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT:    fmul z1.h, z1.h, z3.h
+; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    ret
+  %res = call fast half @llvm.vector.reduce.fmul.nxv8f16(half %init, <vscale x 8 x half> %a)
+  ret half %res
+}
+
+define float @fmulv_nxv2f32(float %init, <vscale x 2 x float> %a) {
+; CHECK-LABEL: fmulv_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.s, #1.00000000
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %res = call fast float @llvm.vector.reduce.fmul.nxv2f32(float %init, <vscale x 2 x float> %a)
+  ret float %res
+}
+
+define float @fmulv_nxv4f32(float %init, <vscale x 4 x float> %a) {
+; CHECK-LABEL: fmulv_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.s, #1.00000000
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z1.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z3.s
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %res = call fast float @llvm.vector.reduce.fmul.nxv4f32(float %init, <vscale x 4 x float> %a)
+  ret float %res
+}
+
+define double @fmulv_nxv2f64(double %init, <vscale x 2 x double> %a) {
+; CHECK-LABEL: fmulv_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z2.d, #1.00000000
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z2.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z3.d
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
+  %res = call fast double @llvm.vector.reduce.fmul.nxv2f64(double %init, <vscale x 2 x double> %a)
+  ret double %res
+}
+
 declare half @llvm.vector.reduce.fadd.nxv2f16(half, <vscale x 2 x half>)
 declare half @llvm.vector.reduce.fadd.nxv4f16(half, <vscale x 4 x half>)
 declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
-declare half @llvm.vector.reduce.fadd.nxv6f16(half, <vscale x 6 x half>)
-declare half @llvm.vector.reduce.fadd.nxv10f16(half, <vscale x 10 x half>)
-declare half @llvm.vector.reduce.fadd.nxv12f16(half, <vscale x 12 x half>)
 declare float @llvm.vector.reduce.fadd.nxv2f32(float, <vscale x 2 x float>)
 declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>)
 declare float @llvm.vector.reduce.fadd.nxv8f32(float, <vscale x 8 x float>)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
 declare float @llvm.vector.reduce.fminimum.nxv2f32(<vscale x 2 x float>)
 declare float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float>)
 declare double @llvm.vector.reduce.fminimum.nxv2f64(<vscale x 2 x double>)
+
+declare half @llvm.vector.reduce.fmul.nxv2f16(half, <vscale x 2 x half>)
+declare half @llvm.vector.reduce.fmul.nxv4f16(half, <vscale x 4 x half>)
+declare half @llvm.vector.reduce.fmul.nxv8f16(half, <vscale x 8 x half>)
+declare float @llvm.vector.reduce.fmul.nxv2f32(float, <vscale x 2 x float>)
+declare float @llvm.vector.reduce.fmul.nxv4f32(float, <vscale x 4 x float>)
+declare double @llvm.vector.reduce.fmul.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index be936f0fd6d4a..6fb0315e27fb5 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -369,6 +369,131 @@ define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
   ret i64 %res
 }
 
+; No MULV instruction so use knowledge about the architectural maximum size of
+; an SVE register to "scalarise" the reduction.
+
+define i8 @mulv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: mulv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    uzp2 z2.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.mul.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @mulv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: mulv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    uzp2 z2.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.mul.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @mulv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: mulv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uzp2 z2.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @mulv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: mulv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z2.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.mul.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
 ; Test widen vector reduce type
 declare i8 @llvm.vector.reduce.smin.nxv10i8(<vscale x 10 x i8>)
 
diff --git a/llvm/test/CodeGen/Hexagon/swp-many-stores.mir b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
new file mode 100644
index 0000000000000..bf14dcf3c4fb3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-many-stores.mir
@@ -0,0 +1,88 @@
+# RUN: llc -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null -pipeliner-max-num-stores=5 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# This loop has six stores, which exceeds the limit set by
+# `pipeliner-max-num-stores`.
+
+# CHECK: Too many stores
+
+--- |
+  target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+  target triple = "hexagon-unknown-linux-musl"
+  
+  define void @f(ptr %a, i32 %n) #0 {
+  entry:
+    %guard = icmp sgt i32 %n, 0
+    %btc = sub nsw i32 %n, 1
+    br i1 %guard, label %loop.preheader, label %exit
+  
+  loop.preheader:                                   ; preds = %entry
+    %0 = add i32 %n, 1
+    %cgep = getelementptr i8, ptr %a, i32 %0
+    br label %loop
+  
+  loop:                                             ; preds = %loop.preheader, %loop
+    %lsr.iv = phi ptr [ %cgep, %loop.preheader ], [ %cgep8, %loop ]
+    %i = phi i32 [ %i.dec, %loop ], [ %btc, %loop.preheader ]
+    %cgep7 = getelementptr i8, ptr %lsr.iv, i32 -2
+    store i8 0, ptr %cgep7, align 1
+    %cgep8 = getelementptr i8, ptr %lsr.iv, i32 -1
+    store i8 1, ptr %cgep8, align 1
+    store i8 2, ptr %lsr.iv, align 1
+    %cgep9 = getelementptr i8, ptr %lsr.iv, i32 1
+    store i8 3, ptr %cgep9, align 1
+    %cgep10 = getelementptr i8, ptr %lsr.iv, i32 2
+    store i8 4, ptr %cgep10, align 1
+    %cgep11 = getelementptr i8, ptr %lsr.iv, i32 3
+    store i8 5, ptr %cgep11, align 1
+    %i.dec = sub i32 %i, 1
+    %ec = icmp eq i32 %i.dec, 0
+    br i1 %ec, label %exit, label %loop
+  
+  exit:                                             ; preds = %loop, %entry
+    ret void
+  }
+  
+  attributes #0 = { "target-cpu"="hexagonv79" }
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $r0, $r1
+  
+    %7:intregs = COPY $r1
+    %6:intregs = COPY $r0
+    %8:predregs = C2_cmpgti %7, 0
+    J2_jumpf %8, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.loop.preheader:
+    successors: %bb.2(0x80000000)
+  
+    %0:intregs = A2_addi %7, -1
+    %1:intregs = S4_addaddi %7, %6, 1
+    %10:intregs = A2_tfrsi 0
+    %11:intregs = A2_tfrsi 1
+    %14:intregs = COPY %0
+    J2_loop0r %bb.2, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2.loop (machine-block-address-taken):
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+  
+    %2:intregs = PHI %1, %bb.1, %4, %bb.2
+    S2_storerb_io %2, -2, %10 :: (store (s8) into %ir.cgep7)
+    %4:intregs = A2_addi %2, -1
+    S2_storerb_io %2, -1, %11 :: (store (s8) into %ir.cgep8)
+    S4_storeirb_io %2, 0, 2 :: (store (s8) into %ir.lsr.iv)
+    S4_storeirb_io %2, 1, 3 :: (store (s8) into %ir.cgep9)
+    S4_storeirb_io %2, 2, 4 :: (store (s8) into %ir.cgep10)
+    S4_storeirb_io %2, 3, 5 :: (store (s8) into %ir.cgep11)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 04a2268db1755..314e1b4fc69a1 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
 ; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH
+; RUN: opt -mattr=+simd128,+relaxed-simd -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128,+relaxed-simd -verify-machineinstrs -o - | FileCheck %s --check-prefix=RELAXED-MAX-BANDWIDTH
 
 target triple = "wasm32"
 
@@ -23,6 +24,10 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.relaxed_dot_i8x16_i7x16_add_s
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -47,6 +52,109 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {  ; mixed-sign i8 multiply-accumulate: returns the i32 sum over i in [0, N) of zext(b[i]) * sext(a[i])
+; CHECK-LABEL: i32_mac_u8_s8:
+; CHECK: loop
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_u
+; CHECK: i32x4.extend_low_i16x8_u
+; CHECK: v128.load32_zero
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: i32x4.mul
+; CHECK: i32x4.add
+
+; MAX-BANDWIDTH: loop
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.mul
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: loop
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+entry:
+  %cmp7.not = icmp eq i32 %N, 0  ; true when there are no elements to process
+  br i1 %cmp7.not, label %for.cond.cleanup, label %for.body  ; N == 0 bypasses the loop
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]  ; 0 if the loop was bypassed, otherwise the final sum
+  ret i32 %res.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; element index, starts at 0
+  %res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]  ; running sum, starts at 0
+  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %0 to i32  ; a[i] is widened as a signed byte
+  %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i32  ; b[i] is widened as an unsigned byte
+  %mul = mul nsw i32 %conv2, %conv  ; unsigned operand first, signed operand second
+  %add = add nsw i32 %mul, %res.08  ; accumulate the product into the running sum
+  %inc = add nuw i32 %i.09, 1
+  %exitcond.not = icmp eq i32 %inc, %N  ; leave the loop once all N elements are consumed
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
 define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
 ; CHECK-LABEL: i32_mac_s16:
 ; CHECK:    i32x4.load16x4_s 0:p2align=1
@@ -57,6 +165,12 @@ define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: v128.load
 ; MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
+
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.dot_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.add
 
 entry:
   %cmp7.not = icmp eq i32 %N, 0
@@ -116,6 +230,31 @@ define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -156,6 +295,14 @@ define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_s
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp6.not = icmp eq i32 %N, 0
   br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
@@ -197,6 +344,15 @@ define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture n
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_low_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extmul_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -235,6 +391,13 @@ define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extmul_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -277,6 +440,17 @@ define hidden i32 @i32_mac_u16_s16(ptr nocapture noundef readonly %a, ptr nocapt
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+; RELAXED-MAX-BANDWIDTH: i32x4.add
+
 entry:
   %cmp7.not = icmp eq i32 %N, 0
   br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
@@ -335,6 +509,32 @@ define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i64x2.extmul_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
+
 entry:
   %cmp8.not = icmp eq i32 %N, 0
   br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -375,6 +575,14 @@ define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture
 ; MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
 ; MAX-BANDWIDTH: i64x2.add
 
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: v128.load
+; RELAXED-MAX-BANDWIDTH: i32x4.mul
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_high_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+; RELAXED-MAX-BANDWIDTH: i64x2.extend_low_i32x4_u
+; RELAXED-MAX-BANDWIDTH: i64x2.add
+
 entry:
   %cmp6.not = icmp eq i32 %N, 0
   br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 55715197830b1..c90344b889b8c 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -502,11 +502,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
 ; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,16,0,8,8,0,0,0,0,0,2,0,2,0,0,0]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -517,7 +517,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
 ; SSE41-NEXT:    psubw %xmm1, %xmm0
-; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,0,0,0]
 ; SSE41-NEXT:    paddw %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
 ; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
@@ -530,7 +530,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
 ; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
 ; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
@@ -541,7 +541,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
 ; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,0,0,0]
 ; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    retq
@@ -630,7 +630,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,0,0]
 ; SSE2-NEXT:    psrlw $15, %xmm0
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
@@ -641,7 +641,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [171,0,0,0]
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    packuswb %xmm2, %xmm2
 ; SSE41-NEXT:    psrlw $7, %xmm2
@@ -654,7 +654,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX-LABEL: combine_vec_udiv_nonuniform4:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [171,0,0,0]
 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
@@ -691,7 +691,7 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-NEXT:    psubw %xmm3, %xmm0
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
 ; SSE2-NEXT:    paddw %xmm3, %xmm0
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,u,0,4,0,4,16,0,4,0,0,4,0,0,0,16]
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c307a5759..4ec54d8143cef 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2071,7 +2071,7 @@ define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,7,42,32]
 ; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index cc4bda81bef52..650b562e6cb5c 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 7c1a1e285ca05..874d88500c425 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i686-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X86
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=X64
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,SSE2
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefixes=X64,SSE41
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,AVX1
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 61740115c5fae..83a0ddb84ad18 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -5,9 +5,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
 
@@ -598,6 +598,33 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512F-LABEL: var_shuffle_zero_v8i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512F-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v8i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT:    vpcmpnleuw %zmm2, %zmm1, %k1
+; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
+; AVX512BW-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
 ; AVX512VL-LABEL: var_shuffle_zero_v8i16:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
@@ -921,6 +948,28 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
 ; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512F-LABEL: var_shuffle_zero_v16i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX512F-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: var_shuffle_zero_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpcmpnleub %zmm2, %zmm1, %k1
+; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
 ; AVX512VL-LABEL: var_shuffle_zero_v16i8:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 9b528574e1188..d16b28aefffac 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1872,7 +1872,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
 ; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    paddw %xmm0, %xmm0
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
@@ -1964,7 +1964,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
-; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm2
 ; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 983ae594e3ab1..3d85d5587a45f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -851,7 +851,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    paddw %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index d565ef01ececf..1602cde7a8e16 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1673,7 +1673,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
@@ -1750,7 +1750,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
 ; X86-SSE-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,128,0,16,0,2,0,32,0,64,0,0,0,8,0,4]
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 8cb2c7be3b044..a847da6eff3ff 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1223,7 +1223,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1275,7 +1275,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; X86-SSE-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,0,16,0,8,0,4,0,2]
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index 57874c4399277..eb39b6a0d2227 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1480,7 +1480,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1532,7 +1532,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; X86-SSE-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,0,128,0,64,0,32,u,u,u,u,u,u,u,u]
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
   %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
index 73653d0dd0067..6345b2f534f34 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_salu_lit64.s
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
 
 s_mov_b64 s[2:3], 0x10abcdef12345678
 // GFX1250: s_mov_b64 s[2:3], 0x10abcdef12345678    ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
@@ -62,10 +62,8 @@ s_cselect_b64 s[2:3], s[4:5], 0x10abcdef12345678
 s_mov_b64 s[2:3], 0xffffffff01234567
 // GFX1250: s_mov_b64 s[2:3], 0xffffffff01234567    ; encoding: [0xfe,0x01,0x82,0xbe,0x67,0x45,0x23,0x01,0xff,0xff,0xff,0xff]
 
-// TODO: disasm
 s_mov_b64 s[2:3], lit64(0x777)
-// GFX1250-ASM: s_mov_b64 s[2:3], lit64(0x777)          ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00]
-// GFX1250-DIS: s_mov_b64 s[2:3], 0x777                 ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00]
+// GFX1250: s_mov_b64 s[2:3], lit64(0x777)          ; encoding: [0xfe,0x01,0x82,0xbe,0x77,0x07,0x00,0x00,0x00,0x00,0x00,0x00]
 
 s_mov_b64 s[2:3], 0x777
 // GFX1250: s_mov_b64 s[2:3], 0x777                     ; encoding: [0xff,0x01,0x82,0xbe,0x77,0x07,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index 0d61c1f50885c..39de9a268db95 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
 
 v_mov_b64_e32 v[4:5], v[2:3]
 // GFX1250: v_mov_b64_e32 v[4:5], v[2:3]            ; encoding: [0x02,0x3b,0x08,0x7e]
@@ -26,8 +26,10 @@ v_mov_b64 v[4:5], -1
 v_mov_b64 v[4:5], 0.5
 // GFX1250: v_mov_b64_e32 v[4:5], 0.5               ; encoding: [0xf0,0x3a,0x08,0x7e]
 
+// TODO: Encode as a 32-bit literal unless lit64() is specified.
 v_mov_b64 v[254:255], 0xaf123456
-// GFX1250: v_mov_b64_e32 v[254:255], 0xaf123456    ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: v_mov_b64_e32 v[254:255], 0xaf123456    ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 v_tanh_f32 v5, v1
 // GFX1250: v_tanh_f32_e32 v5, v1                   ; encoding: [0x01,0x3d,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
index 02872b0da76dd..d9f69343fae3b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop2.s
@@ -196,8 +196,9 @@ v_add_nc_u64 v[4:5], -4.0, v[4:5]
 // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_add_nc_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_add_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_add_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x50,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 
 v_add_nc_u64 v[4:5], 0x3f717273, v[4:5]
 // GFX1250: v_add_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x50,0x73,0x72,0x71,0x3f]
@@ -316,8 +317,9 @@ v_sub_nc_u64 v[4:5], -4.0, v[4:5]
 // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_sub_nc_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_sub_nc_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_sub_nc_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x52,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 
 v_sub_nc_u64 v[4:5], 0x3f717273, v[4:5]
 // GFX1250: v_sub_nc_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x52,0x73,0x72,0x71,0x3f]
@@ -436,8 +438,9 @@ v_mul_u64 v[4:5], -4.0, v[4:5]
 // GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
 v_mul_u64 v[4:5], 0xaf123456, v[4:5]
-// GFX1250: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
-// GFX1200-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// GFX1250-ASM: v_mul_u64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: v_mul_u64_e32 v[4:5], lit64(0xaf123456), v[4:5] ; encoding: [0xfe,0x08,0x08,0x54,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1200-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
 
 v_mul_u64 v[4:5], 0x3f717273, v[4:5]
 // GFX1250: v_mul_u64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x54,0x73,0x72,0x71,0x3f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index ad5771bbbafef..0548e9d24c113 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250,GFX1250-DIS %s
 
 s_alloc_vgpr 0x1235
 // GFX12: s_alloc_vgpr 0x1235                     ; encoding: [0xff,0x53,0x80,0xbe,0x35,0x12,0x00,0x00]
@@ -860,7 +860,8 @@ s_mov_b64 s[0:1], 0x3f717273
 
 s_mov_b64 s[0:1], 0xaf123456
 // GFX1200: s_mov_b64 s[0:1], 0xaf123456            ; encoding: [0xff,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mov_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mov_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mov_b64 s[0:1], lit64(0xaf123456)     ; encoding: [0xfe,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_mov_b64 s[0:1], null
 // GFX12: s_mov_b64 s[0:1], null                  ; encoding: [0x7c,0x01,0x80,0xbe]
@@ -969,7 +970,8 @@ s_cmov_b64 s[0:1], 0x3f717273
 
 s_cmov_b64 s[0:1], 0xaf123456
 // GFX1200: s_cmov_b64 s[0:1], 0xaf123456           ; encoding: [0xff,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmov_b64 s[0:1], 0xaf123456           ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmov_b64 s[0:1], 0xaf123456           ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmov_b64 s[0:1], lit64(0xaf123456)    ; encoding: [0xfe,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_not_b32 s0, s1
 // GFX12: s_not_b32 s0, s1                        ; encoding: [0x01,0x1e,0x80,0xbe]
@@ -1072,7 +1074,8 @@ s_not_b64 s[0:1], 0x3f717273
 
 s_not_b64 s[0:1], 0xaf123456
 // GFX1200: s_not_b64 s[0:1], 0xaf123456            ; encoding: [0xff,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_not_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_not_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_not_b64 s[0:1], lit64(0xaf123456)     ; encoding: [0xfe,0x1f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_wqm_b32 s0, s1
 // GFX12: s_wqm_b32 s0, s1                        ; encoding: [0x01,0x1c,0x80,0xbe]
@@ -1175,7 +1178,8 @@ s_wqm_b64 s[0:1], 0x3f717273
 
 s_wqm_b64 s[0:1], 0xaf123456
 // GFX1200: s_wqm_b64 s[0:1], 0xaf123456            ; encoding: [0xff,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_wqm_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_wqm_b64 s[0:1], 0xaf123456            ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_wqm_b64 s[0:1], lit64(0xaf123456)     ; encoding: [0xfe,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_brev_b32 s0, s1
 // GFX12: s_brev_b32 s0, s1                       ; encoding: [0x01,0x04,0x80,0xbe]
@@ -1278,7 +1282,8 @@ s_brev_b64 s[0:1], 0x3f717273
 
 s_brev_b64 s[0:1], 0xaf123456
 // GFX1200: s_brev_b64 s[0:1], 0xaf123456           ; encoding: [0xff,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_brev_b64 s[0:1], 0xaf123456           ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_brev_b64 s[0:1], 0xaf123456           ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_brev_b64 s[0:1], lit64(0xaf123456)    ; encoding: [0xfe,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_bcnt0_i32_b32 s0, s1
 // GFX12: s_bcnt0_i32_b32 s0, s1                  ; encoding: [0x01,0x16,0x80,0xbe]
@@ -1390,7 +1395,8 @@ s_bcnt0_i32_b64 s0, 0x3f717273
 
 s_bcnt0_i32_b64 s0, 0xaf123456
 // GFX1200: s_bcnt0_i32_b64 s0, 0xaf123456          ; encoding: [0xff,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bcnt0_i32_b64 s0, 0xaf123456          ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bcnt0_i32_b64 s0, 0xaf123456          ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bcnt0_i32_b64 s0, lit64(0xaf123456)   ; encoding: [0xfe,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_bcnt1_i32_b32 s0, s1
 // GFX12: s_bcnt1_i32_b32 s0, s1                  ; encoding: [0x01,0x18,0x80,0xbe]
@@ -1502,7 +1508,8 @@ s_bcnt1_i32_b64 s0, 0x3f717273
 
 s_bcnt1_i32_b64 s0, 0xaf123456
 // GFX1200: s_bcnt1_i32_b64 s0, 0xaf123456          ; encoding: [0xff,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bcnt1_i32_b64 s0, 0xaf123456          ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bcnt1_i32_b64 s0, 0xaf123456          ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bcnt1_i32_b64 s0, lit64(0xaf123456)   ; encoding: [0xfe,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_ff1_i32_b32 s0, s1
 // GFX12: s_ctz_i32_b32 s0, s1                    ; encoding: [0x01,0x08,0x80,0xbe]
@@ -1614,7 +1621,8 @@ s_ff1_i32_b64 s0, 0x3f717273
 
 s_ff1_i32_b64 s0, 0xaf123456
 // GFX1200: s_ctz_i32_b64 s0, 0xaf123456            ; encoding: [0xff,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ctz_i32_b64 s0, 0xaf123456            ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ctz_i32_b64 s0, 0xaf123456            ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ctz_i32_b64 s0, lit64(0xaf123456)     ; encoding: [0xfe,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_flbit_i32_b32 s0, s1
 // GFX12: s_clz_i32_u32 s0, s1                    ; encoding: [0x01,0x0a,0x80,0xbe]
@@ -1726,7 +1734,8 @@ s_flbit_i32_b64 s0, 0x3f717273
 
 s_flbit_i32_b64 s0, 0xaf123456
 // GFX1200: s_clz_i32_u64 s0, 0xaf123456            ; encoding: [0xff,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_clz_i32_u64 s0, 0xaf123456            ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_clz_i32_u64 s0, 0xaf123456            ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_clz_i32_u64 s0, lit64(0xaf123456)     ; encoding: [0xfe,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_flbit_i32 s0, s1
 // GFX12: s_cls_i32 s0, s1                        ; encoding: [0x01,0x0c,0x80,0xbe]
@@ -1838,7 +1847,8 @@ s_flbit_i32_i64 s0, 0x3f717273
 
 s_flbit_i32_i64 s0, 0xaf123456
 // GFX1200: s_cls_i32_i64 s0, 0xaf123456            ; encoding: [0xff,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cls_i32_i64 s0, 0xaf123456            ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cls_i32_i64 s0, 0xaf123456            ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cls_i32_i64 s0, lit64(0xaf123456)     ; encoding: [0xfe,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_sext_i32_i8 s0, s1
 // GFX12: s_sext_i32_i8 s0, s1                    ; encoding: [0x01,0x0e,0x80,0xbe]
@@ -2284,7 +2294,8 @@ s_and_saveexec_b64 s[0:1], 0x3f717273
 
 s_and_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_and_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xff,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_or_saveexec_b64 s[0:1], s[2:3]        ; encoding: [0x02,0x23,0x80,0xbe]
@@ -2324,7 +2335,8 @@ s_or_saveexec_b64 s[0:1], 0x3f717273
 
 s_or_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_or_saveexec_b64 s[0:1], 0xaf123456    ; encoding: [0xff,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_saveexec_b64 s[0:1], 0xaf123456    ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_saveexec_b64 s[0:1], 0xaf123456    ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xor_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_xor_saveexec_b64 s[0:1], s[2:3]       ; encoding: [0x02,0x25,0x80,0xbe]
@@ -2364,7 +2376,8 @@ s_xor_saveexec_b64 s[0:1], 0x3f717273
 
 s_xor_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_xor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xff,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_andn2_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_and_not1_saveexec_b64 s[0:1], s[2:3]  ; encoding: [0x02,0x31,0x80,0xbe]
@@ -2404,7 +2417,8 @@ s_andn2_saveexec_b64 s[0:1], 0x3f717273
 
 s_andn2_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_orn2_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_or_not1_saveexec_b64 s[0:1], s[2:3]   ; encoding: [0x02,0x33,0x80,0xbe]
@@ -2444,7 +2458,8 @@ s_orn2_saveexec_b64 s[0:1], 0x3f717273
 
 s_orn2_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x33,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nand_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_nand_saveexec_b64 s[0:1], s[2:3]      ; encoding: [0x02,0x27,0x80,0xbe]
@@ -2484,7 +2499,8 @@ s_nand_saveexec_b64 s[0:1], 0x3f717273
 
 s_nand_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_nand_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xff,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nor_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_nor_saveexec_b64 s[0:1], s[2:3]       ; encoding: [0x02,0x29,0x80,0xbe]
@@ -2524,7 +2540,8 @@ s_nor_saveexec_b64 s[0:1], 0x3f717273
 
 s_nor_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_nor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xff,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_saveexec_b64 s[0:1], 0xaf123456   ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xnor_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_xnor_saveexec_b64 s[0:1], s[2:3]      ; encoding: [0x02,0x2b,0x80,0xbe]
@@ -2564,7 +2581,8 @@ s_xnor_saveexec_b64 s[0:1], 0x3f717273
 
 s_xnor_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_xnor_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xff,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_saveexec_b64 s[0:1], 0xaf123456  ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_quadmask_b32 s0, s1
 // GFX12: s_quadmask_b32 s0, s1                   ; encoding: [0x01,0x1a,0x80,0xbe]
@@ -2667,7 +2685,8 @@ s_quadmask_b64 s[0:1], 0x3f717273
 
 s_quadmask_b64 s[0:1], 0xaf123456
 // GFX1200: s_quadmask_b64 s[0:1], 0xaf123456       ; encoding: [0xff,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_quadmask_b64 s[0:1], 0xaf123456       ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_quadmask_b64 s[0:1], 0xaf123456       ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_quadmask_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_movrels_b32 s0, s1
 // GFX12: s_movrels_b32 s0, s1                    ; encoding: [0x01,0x40,0x80,0xbe]
@@ -2812,7 +2831,8 @@ s_movreld_b64 s[0:1], 0x3f717273
 
 s_movreld_b64 s[0:1], 0xaf123456
 // GFX1200: s_movreld_b64 s[0:1], 0xaf123456        ; encoding: [0xff,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_movreld_b64 s[0:1], 0xaf123456        ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_movreld_b64 s[0:1], 0xaf123456        ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_movreld_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x43,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_abs_i32 s0, s1
 // GFX12: s_abs_i32 s0, s1                        ; encoding: [0x01,0x15,0x80,0xbe]
@@ -2912,7 +2932,8 @@ s_andn1_saveexec_b64 s[0:1], 0x3f717273
 
 s_andn1_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_orn1_saveexec_b64 s[0:1], s[2:3]
 // GFX12: s_or_not0_saveexec_b64 s[0:1], s[2:3]   ; encoding: [0x02,0x2f,0x80,0xbe]
@@ -2952,7 +2973,8 @@ s_orn1_saveexec_b64 s[0:1], 0x3f717273
 
 s_orn1_saveexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not0_saveexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not0_saveexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x2f,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_andn1_wrexec_b64 s[0:1], s[2:3]
 // GFX12: s_and_not0_wrexec_b64 s[0:1], s[2:3]    ; encoding: [0x02,0x35,0x80,0xbe]
@@ -2992,7 +3014,8 @@ s_andn1_wrexec_b64 s[0:1], 0x3f717273
 
 s_andn1_wrexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_andn2_wrexec_b64 s[0:1], s[2:3]
 // GFX12: s_and_not1_wrexec_b64 s[0:1], s[2:3]    ; encoding: [0x02,0x37,0x80,0xbe]
@@ -3032,7 +3055,8 @@ s_andn2_wrexec_b64 s[0:1], 0x3f717273
 
 s_andn2_wrexec_b64 s[0:1], 0xaf123456
 // GFX1200: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xff,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_wrexec_b64 s[0:1], 0xaf123456 ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_wrexec_b64 s[0:1], lit64(0xaf123456) ; encoding: [0xfe,0x37,0x80,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_bitreplicate_b64_b32 s[0:1], s2
 // GFX12: s_bitreplicate_b64_b32 s[0:1], s2       ; encoding: [0x02,0x14,0x80,0xbe]
@@ -3831,7 +3855,8 @@ s_ctz_i32_b64 exec_hi, src_scc
 
 s_ctz_i32_b64 null, 0xaf123456
 // GFX1200: s_ctz_i32_b64 null, 0xaf123456          ; encoding: [0xff,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ctz_i32_b64 null, 0xaf123456          ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ctz_i32_b64 null, 0xaf123456          ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ctz_i32_b64 null, lit64(0xaf123456)   ; encoding: [0xfe,0x09,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_not1_saveexec_b64 s[10:11], s[2:3]
 // GFX12: s_and_not1_saveexec_b64 s[10:11], s[2:3] ; encoding: [0x02,0x31,0x8a,0xbe]
@@ -3859,7 +3884,8 @@ s_and_not1_saveexec_b64 ttmp[14:15], src_scc
 
 s_and_not1_saveexec_b64 null, 0xaf123456
 // GFX1200: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x31,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_not0_saveexec_b32 s5, s1
 // GFX12: s_and_not0_saveexec_b32 s5, s1          ; encoding: [0x01,0x2c,0x85,0xbe]
@@ -3920,7 +3946,8 @@ s_and_not0_saveexec_b64 ttmp[14:15], src_scc
 
 s_and_not0_saveexec_b64 null, 0xaf123456
 // GFX1200: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_not0_wrexec_b32 s5, s1
 // GFX12: s_and_not0_wrexec_b32 s5, s1            ; encoding: [0x01,0x34,0x85,0xbe]
@@ -3981,7 +4008,8 @@ s_and_not0_wrexec_b64 ttmp[14:15], src_scc
 
 s_and_not0_wrexec_b64 null, 0xaf123456
 // GFX1200: s_and_not0_wrexec_b64 null, 0xaf123456  ; encoding: [0xff,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not0_wrexec_b64 null, 0xaf123456  ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not0_wrexec_b64 null, 0xaf123456  ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not0_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x35,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_not1_saveexec_b32 s5, s1
 // GFX12: s_and_not1_saveexec_b32 s5, s1          ; encoding: [0x01,0x30,0x85,0xbe]
@@ -4075,7 +4103,8 @@ s_and_not1_wrexec_b64 ttmp[14:15], src_scc
 
 s_and_not1_wrexec_b64 null, 0xaf123456
 // GFX1200: s_and_not1_wrexec_b64 null, 0xaf123456  ; encoding: [0xff,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_wrexec_b64 null, 0xaf123456  ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_wrexec_b64 null, 0xaf123456  ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_wrexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x37,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_cls_i32 s5, s1
 // GFX12: s_cls_i32 s5, s1                        ; encoding: [0x01,0x0c,0x85,0xbe]
@@ -4145,7 +4174,8 @@ s_cls_i32_i64 exec_hi, src_scc
 
 s_cls_i32_i64 null, 0xaf123456
 // GFX1200: s_cls_i32_i64 null, 0xaf123456          ; encoding: [0xff,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cls_i32_i64 null, 0xaf123456          ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cls_i32_i64 null, 0xaf123456          ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cls_i32_i64 null, lit64(0xaf123456)   ; encoding: [0xfe,0x0d,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_clz_i32_u32 s5, s1
 // GFX12: s_clz_i32_u32 s5, s1                    ; encoding: [0x01,0x0a,0x85,0xbe]
@@ -4215,7 +4245,8 @@ s_clz_i32_u64 exec_hi, src_scc
 
 s_clz_i32_u64 null, 0xaf123456
 // GFX1200: s_clz_i32_u64 null, 0xaf123456          ; encoding: [0xff,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_clz_i32_u64 null, 0xaf123456          ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_clz_i32_u64 null, 0xaf123456          ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_clz_i32_u64 null, lit64(0xaf123456)   ; encoding: [0xfe,0x0b,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_not0_saveexec_b32 s5, s1
 // GFX12: s_or_not0_saveexec_b32 s5, s1           ; encoding: [0x01,0x2e,0x85,0xbe]
@@ -4276,7 +4307,8 @@ s_or_not0_saveexec_b64 ttmp[14:15], src_scc
 
 s_or_not0_saveexec_b64 null, 0xaf123456
 // GFX1200: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not0_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not0_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x2f,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_not1_saveexec_b32 s5, s1
 // GFX12: s_or_not1_saveexec_b32 s5, s1           ; encoding: [0x01,0x32,0x85,0xbe]
@@ -4337,4 +4369,5 @@ s_or_not1_saveexec_b64 ttmp[14:15], src_scc
 
 s_or_not1_saveexec_b64 null, 0xaf123456
 // GFX1200: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xff,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_saveexec_b64 null, 0xaf123456 ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_saveexec_b64 null, lit64(0xaf123456) ; encoding: [0xfe,0x33,0xfc,0xbe,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
index 9c83879f0430e..3a24442312af6 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2.s
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s
 
 s_add_nc_u64 s[0:1], s[2:3], s[4:5]
 // GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5]     ; encoding: [0x02,0x04,0x80,0xa9]
@@ -56,7 +56,8 @@ s_add_nc_u64 s[0:1], 0x3f717273, s[2:3]
 
 s_add_nc_u64 s[0:1], 0xaf123456, s[2:3]
 // GFX1200: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf]
-// GFX1250: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_add_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_add_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_add_nc_u64 s[0:1], s[2:3], exec
 // GFX12: s_add_nc_u64 s[0:1], s[2:3], exec       ; encoding: [0x02,0x7e,0x80,0xa9]
@@ -81,7 +82,8 @@ s_add_nc_u64 s[0:1], s[2:3], 0x3f717273
 
 s_add_nc_u64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0xa9,0x56,0x34,0x12,0xaf]
-// GFX1250: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_add_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_add_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xa9,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_sub_nc_u64 s[0:1], s[2:3], s[4:5]
 // GFX12: s_sub_nc_u64 s[0:1], s[2:3], s[4:5]     ; encoding: [0x02,0x04,0x00,0xaa]
@@ -136,7 +138,8 @@ s_sub_nc_u64 s[0:1], 0x3f717273, s[2:3]
 
 s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3]
 // GFX1200: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xff,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_sub_nc_u64 s[0:1], 0xaf123456, s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_sub_nc_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_sub_nc_u64 s[0:1], s[2:3], exec
 // GFX12: s_sub_nc_u64 s[0:1], s[2:3], exec       ; encoding: [0x02,0x7e,0x00,0xaa]
@@ -161,7 +164,8 @@ s_sub_nc_u64 s[0:1], s[2:3], 0x3f717273
 
 s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x00,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_sub_nc_u64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_sub_nc_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x00,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_mul_u64 s[0:1], s[2:3], s[4:5]
 // GFX12: s_mul_u64 s[0:1], s[2:3], s[4:5]        ; encoding: [0x02,0x04,0x80,0xaa]
@@ -216,7 +220,8 @@ s_mul_u64 s[0:1], 0x3f717273, s[2:3]
 
 s_mul_u64 s[0:1], 0xaf123456, s[2:3]
 // GFX1200: s_mul_u64 s[0:1], 0xaf123456, s[2:3]    ; encoding: [0xff,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mul_u64 s[0:1], 0xaf123456, s[2:3]    ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mul_u64 s[0:1], 0xaf123456, s[2:3]    ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mul_u64 s[0:1], lit64(0xaf123456), s[2:3] ; encoding: [0xfe,0x02,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_mul_u64 s[0:1], s[2:3], exec
 // GFX12: s_mul_u64 s[0:1], s[2:3], exec          ; encoding: [0x02,0x7e,0x80,0xaa]
@@ -241,7 +246,8 @@ s_mul_u64 s[0:1], s[2:3], 0x3f717273
 
 s_mul_u64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_mul_u64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xff,0x80,0xaa,0x56,0x34,0x12,0xaf]
-// GFX1250: s_mul_u64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_mul_u64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_mul_u64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0xaa,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_add_f32 s5, s1, s2
 // GFX12: s_add_f32 s5, s1, s2                    ; encoding: [0x01,0x02,0x05,0xa0]
@@ -2359,7 +2365,8 @@ s_cselect_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_cselect_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x98,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cselect_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cselect_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_cselect_b64 s[0:1], s[2:3], exec
 // GFX12: s_cselect_b64 s[0:1], s[2:3], exec      ; encoding: [0x02,0x7e,0x80,0x98]
@@ -2384,7 +2391,8 @@ s_cselect_b64 s[0:1], s[2:3], 0x3f717273
 
 s_cselect_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x98,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cselect_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cselect_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x98,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_b32 s0, s1, s2
 // GFX12: s_and_b32 s0, s1, s2                    ; encoding: [0x01,0x02,0x00,0x8b]
@@ -2553,7 +2561,8 @@ s_and_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_and_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_and_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xff,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_b64 s[0:1], s[2:3], exec
 // GFX12: s_and_b64 s[0:1], s[2:3], exec          ; encoding: [0x02,0x7e,0x80,0x8b]
@@ -2578,7 +2587,8 @@ s_and_b64 s[0:1], s[2:3], 0x3f717273
 
 s_and_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_and_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xff,0x80,0x8b,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8b,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_b32 s0, s1, s2
 // GFX12: s_or_b32 s0, s1, s2                     ; encoding: [0x01,0x02,0x00,0x8c]
@@ -2738,7 +2748,8 @@ s_or_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_or_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_or_b64 s[0:1], 0xaf123456, s[4:5]     ; encoding: [0xff,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_b64 s[0:1], 0xaf123456, s[4:5]     ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_b64 s[0:1], 0xaf123456, s[4:5]     ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_b64 s[0:1], s[2:3], exec
 // GFX12: s_or_b64 s[0:1], s[2:3], exec           ; encoding: [0x02,0x7e,0x80,0x8c]
@@ -2763,7 +2774,8 @@ s_or_b64 s[0:1], s[2:3], 0x3f717273
 
 s_or_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_or_b64 s[0:1], s[2:3], 0xaf123456     ; encoding: [0x02,0xff,0x80,0x8c,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_b64 s[0:1], s[2:3], 0xaf123456     ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_b64 s[0:1], s[2:3], 0xaf123456     ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8c,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xor_b32 s0, s1, s2
 // GFX12: s_xor_b32 s0, s1, s2                    ; encoding: [0x01,0x02,0x00,0x8d]
@@ -2923,7 +2935,8 @@ s_xor_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_xor_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_xor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xff,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xor_b64 s[0:1], s[2:3], exec
 // GFX12: s_xor_b64 s[0:1], s[2:3], exec          ; encoding: [0x02,0x7e,0x80,0x8d]
@@ -2948,7 +2961,8 @@ s_xor_b64 s[0:1], s[2:3], 0x3f717273
 
 s_xor_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_xor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xff,0x80,0x8d,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8d,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_andn2_b32 s0, s1, s2
 // GFX12: s_and_not1_b32 s0, s1, s2               ; encoding: [0x01,0x02,0x00,0x91]
@@ -3108,7 +3122,8 @@ s_andn2_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_andn2_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_andn2_b64 s[0:1], s[2:3], exec
 // GFX12: s_and_not1_b64 s[0:1], s[2:3], exec     ; encoding: [0x02,0x7e,0x80,0x91]
@@ -3133,7 +3148,8 @@ s_andn2_b64 s[0:1], s[2:3], 0x3f717273
 
 s_andn2_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_orn2_b32 s0, s1, s2
 // GFX12: s_or_not1_b32 s0, s1, s2                ; encoding: [0x01,0x02,0x00,0x92]
@@ -3293,7 +3309,8 @@ s_orn2_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_orn2_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xff,0x04,0x80,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[0:1], 0xaf123456, s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_orn2_b64 s[0:1], s[2:3], exec
 // GFX12: s_or_not1_b64 s[0:1], s[2:3], exec      ; encoding: [0x02,0x7e,0x80,0x92]
@@ -3318,7 +3335,8 @@ s_orn2_b64 s[0:1], s[2:3], 0x3f717273
 
 s_orn2_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xff,0x80,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[0:1], s[2:3], 0xaf123456 ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nand_b32 s0, s1, s2
 // GFX12: s_nand_b32 s0, s1, s2                   ; encoding: [0x01,0x02,0x00,0x8e]
@@ -3478,7 +3496,8 @@ s_nand_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_nand_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_nand_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xff,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nand_b64 s[0:1], s[2:3], exec
 // GFX12: s_nand_b64 s[0:1], s[2:3], exec         ; encoding: [0x02,0x7e,0x80,0x8e]
@@ -3503,7 +3522,8 @@ s_nand_b64 s[0:1], s[2:3], 0x3f717273
 
 s_nand_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_nand_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xff,0x80,0x8e,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nand_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nand_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nand_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8e,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nor_b32 s0, s1, s2
 // GFX12: s_nor_b32 s0, s1, s2                    ; encoding: [0x01,0x02,0x00,0x8f]
@@ -3663,7 +3683,8 @@ s_nor_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_nor_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_nor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xff,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_b64 s[0:1], 0xaf123456, s[4:5]    ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_nor_b64 s[0:1], s[2:3], exec
 // GFX12: s_nor_b64 s[0:1], s[2:3], exec          ; encoding: [0x02,0x7e,0x80,0x8f]
@@ -3688,7 +3709,8 @@ s_nor_b64 s[0:1], s[2:3], 0x3f717273
 
 s_nor_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_nor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xff,0x80,0x8f,0x56,0x34,0x12,0xaf]
-// GFX1250: s_nor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_nor_b64 s[0:1], s[2:3], 0xaf123456    ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_nor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x8f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xnor_b32 s0, s1, s2
 // GFX12: s_xnor_b32 s0, s1, s2                   ; encoding: [0x01,0x02,0x00,0x90]
@@ -3848,7 +3870,8 @@ s_xnor_b64 s[0:1], 0x3f717273, s[4:5]
 
 s_xnor_b64 s[0:1], 0xaf123456, s[4:5]
 // GFX1200: s_xnor_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xff,0x04,0x80,0x90,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_b64 s[0:1], 0xaf123456, s[4:5]   ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_b64 s[0:1], lit64(0xaf123456), s[4:5] ; encoding: [0xfe,0x04,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_xnor_b64 s[0:1], s[2:3], exec
 // GFX12: s_xnor_b64 s[0:1], s[2:3], exec         ; encoding: [0x02,0x7e,0x80,0x90]
@@ -3873,7 +3896,8 @@ s_xnor_b64 s[0:1], s[2:3], 0x3f717273
 
 s_xnor_b64 s[0:1], s[2:3], 0xaf123456
 // GFX1200: s_xnor_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xff,0x80,0x90,0x56,0x34,0x12,0xaf]
-// GFX1250: s_xnor_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_xnor_b64 s[0:1], s[2:3], 0xaf123456   ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_xnor_b64 s[0:1], s[2:3], lit64(0xaf123456) ; encoding: [0x02,0xfe,0x80,0x90,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_lshl_b32 s0, s1, s2
 // GFX12: s_lshl_b32 s0, s1, s2                   ; encoding: [0x01,0x02,0x00,0x84]
@@ -4033,7 +4057,8 @@ s_lshl_b64 s[0:1], 0x3f717273, s4
 
 s_lshl_b64 s[0:1], 0xaf123456, s4
 // GFX1200: s_lshl_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xff,0x04,0x80,0x84,0x56,0x34,0x12,0xaf]
-// GFX1250: s_lshl_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_lshl_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_lshl_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x84,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_lshl_b64 s[0:1], s[2:3], exec_lo
 // GFX12: s_lshl_b64 s[0:1], s[2:3], exec_lo      ; encoding: [0x02,0x7e,0x80,0x84]
@@ -4217,7 +4242,8 @@ s_lshr_b64 s[0:1], 0x3f717273, s4
 
 s_lshr_b64 s[0:1], 0xaf123456, s4
 // GFX1200: s_lshr_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xff,0x04,0x80,0x85,0x56,0x34,0x12,0xaf]
-// GFX1250: s_lshr_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_lshr_b64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_lshr_b64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x85,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_lshr_b64 s[0:1], s[2:3], exec_lo
 // GFX12: s_lshr_b64 s[0:1], s[2:3], exec_lo      ; encoding: [0x02,0x7e,0x80,0x85]
@@ -4401,7 +4427,8 @@ s_ashr_i64 s[0:1], 0x3f717273, s4
 
 s_ashr_i64 s[0:1], 0xaf123456, s4
 // GFX1200: s_ashr_i64 s[0:1], 0xaf123456, s4       ; encoding: [0xff,0x04,0x80,0x86,0x56,0x34,0x12,0xaf]
-// GFX1250: s_ashr_i64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_ashr_i64 s[0:1], 0xaf123456, s4       ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_ashr_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x86,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_ashr_i64 s[0:1], s[2:3], exec_lo
 // GFX12: s_ashr_i64 s[0:1], s[2:3], exec_lo      ; encoding: [0x02,0x7e,0x80,0x86]
@@ -4996,7 +5023,8 @@ s_bfe_u64 s[0:1], 0x3f717273, s4
 
 s_bfe_u64 s[0:1], 0xaf123456, s4
 // GFX1200: s_bfe_u64 s[0:1], 0xaf123456, s4        ; encoding: [0xff,0x04,0x00,0x94,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bfe_u64 s[0:1], 0xaf123456, s4        ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bfe_u64 s[0:1], 0xaf123456, s4        ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bfe_u64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x00,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_bfe_u64 s[0:1], s[2:3], exec_lo
 // GFX12: s_bfe_u64 s[0:1], s[2:3], exec_lo       ; encoding: [0x02,0x7e,0x00,0x94]
@@ -5075,7 +5103,8 @@ s_bfe_i64 s[0:1], 0x3f717273, s4
 
 s_bfe_i64 s[0:1], 0xaf123456, s4
 // GFX1200: s_bfe_i64 s[0:1], 0xaf123456, s4        ; encoding: [0xff,0x04,0x80,0x94,0x56,0x34,0x12,0xaf]
-// GFX1250: s_bfe_i64 s[0:1], 0xaf123456, s4        ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_bfe_i64 s[0:1], 0xaf123456, s4        ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_bfe_i64 s[0:1], lit64(0xaf123456), s4 ; encoding: [0xfe,0x04,0x80,0x94,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_bfe_i64 s[0:1], s[2:3], exec_lo
 // GFX12: s_bfe_i64 s[0:1], s[2:3], exec_lo       ; encoding: [0x02,0x7e,0x80,0x94]
@@ -6279,7 +6308,8 @@ s_and_not1_b64 s[10:11], vcc, ttmp[14:15]
 
 s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456
 // GFX1200: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_and_not1_b64 s[10:11], exec, src_scc
 // GFX12: s_and_not1_b64 s[10:11], exec, src_scc  ; encoding: [0x7e,0xfd,0x8a,0x91]
@@ -6298,7 +6328,8 @@ s_and_not1_b64 exec, src_scc, exec
 
 s_and_not1_b64 null, 0xaf123456, vcc
 // GFX1200: s_and_not1_b64 null, 0xaf123456, vcc    ; encoding: [0xff,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf]
-// GFX1250: s_and_not1_b64 null, 0xaf123456, vcc    ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_and_not1_b64 null, 0xaf123456, vcc    ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_and_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x91,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_not1_b64 s[10:11], s[2:3], s[4:5]
 // GFX12: s_or_not1_b64 s[10:11], s[2:3], s[4:5]  ; encoding: [0x02,0x04,0x8a,0x92]
@@ -6311,7 +6342,8 @@ s_or_not1_b64 s[10:11], vcc, ttmp[14:15]
 
 s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456
 // GFX1200: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xff,0x8a,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 s[10:11], ttmp[14:15], 0xaf123456 ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 s[10:11], ttmp[14:15], lit64(0xaf123456) ; encoding: [0x7a,0xfe,0x8a,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_or_not1_b64 s[10:11], exec, src_scc
 // GFX12: s_or_not1_b64 s[10:11], exec, src_scc   ; encoding: [0x7e,0xfd,0x8a,0x92]
@@ -6330,4 +6362,5 @@ s_or_not1_b64 exec, src_scc, exec
 
 s_or_not1_b64 null, 0xaf123456, vcc
 // GFX1200: s_or_not1_b64 null, 0xaf123456, vcc     ; encoding: [0xff,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf]
-// GFX1250: s_or_not1_b64 null, 0xaf123456, vcc     ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_or_not1_b64 null, 0xaf123456, vcc     ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_or_not1_b64 null, lit64(0xaf123456), vcc ; encoding: [0xfe,0x6a,0xfc,0x92,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
index 98bb3c3e1da95..8056cef973ecf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopc.s
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX1250 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefixes=GFX12,GFX1250-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding | FileCheck --check-prefixes=GFX12,GFX1250-DIS %s
 
 s_cmp_lt_f32 s1, s2
 // GFX12: s_cmp_lt_f32 s1, s2                     ; encoding: [0x01,0x02,0x41,0xbf]
@@ -2120,7 +2120,8 @@ s_cmp_eq_u64 s[0:1], 0x3f717273
 
 s_cmp_eq_u64 s[0:1], 0xaf123456
 // GFX1200: s_cmp_eq_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmp_eq_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmp_eq_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmp_eq_u64 s[0:1], lit64(0xaf123456)  ; encoding: [0x00,0xfe,0x10,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
 
 s_cmp_lg_u64 s[0:1], s[2:3]
 // GFX12: s_cmp_lg_u64 s[0:1], s[2:3]             ; encoding: [0x00,0x02,0x11,0xbf]
@@ -2163,4 +2164,5 @@ s_cmp_lg_u64 s[0:1], 0x3f717273
 
 s_cmp_lg_u64 s[0:1], 0xaf123456
 // GFX1200: s_cmp_lg_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xff,0x11,0xbf,0x56,0x34,0x12,0xaf]
-// GFX1250: s_cmp_lg_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-ASM: s_cmp_lg_u64 s[0:1], 0xaf123456         ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
+// GFX1250-DIS: s_cmp_lg_u64 s[0:1], lit64(0xaf123456)  ; encoding: [0x00,0xfe,0x11,0xbf,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/lit.local.cfg b/llvm/test/MC/AMDGPU/lit.local.cfg
index c5853ad178a04..12a5c8afa7709 100644
--- a/llvm/test/MC/AMDGPU/lit.local.cfg
+++ b/llvm/test/MC/AMDGPU/lit.local.cfg
@@ -1,4 +1,4 @@
-config.substitutions.append(("%extract-encodings", "sed 's/.*encoding://p'"))
+config.substitutions.append(("%extract-encodings", "sed -n 's/.*encoding://p'"))
 
 if not "AMDGPU" in config.root.targets:
     config.unsupported = True
diff --git a/llvm/test/MC/AMDGPU/offset-expr.s b/llvm/test/MC/AMDGPU/offset-expr.s
index 92a9bf1b4ce9a..7c3c71c1645aa 100644
--- a/llvm/test/MC/AMDGPU/offset-expr.s
+++ b/llvm/test/MC/AMDGPU/offset-expr.s
@@ -9,10 +9,10 @@ BB1:
 v_nop_e64
 BB2:
 s_add_u32 vcc_lo, vcc_lo, (BB2-BB1)&4294967295
-// CHECK: s_add_u32 vcc_lo, vcc_lo, 8   // 000000000018: 806AFF6A 00000008
+// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0x8) // 000000000018: 806AFF6A 00000008
 s_addc_u32 vcc_hi, vcc_hi, (BB2-BB1)>>32
-// CHECK: s_addc_u32 vcc_hi, vcc_hi, 0  // 000000000020: 826BFF6B 00000000
+// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0x0) // 000000000020: 826BFF6B 00000000
 s_add_u32 vcc_lo, vcc_lo, (BB0-BB1)&4294967295
-// CHECK: s_add_u32 vcc_lo, vcc_lo, -16 // 000000000028: 806AFF6A FFFFFFF0
+// CHECK: s_add_u32 vcc_lo, vcc_lo, lit(0xfffffff0) // 000000000028: 806AFF6A FFFFFFF0
 s_addc_u32 vcc_hi, vcc_hi, (BB0-BB1)>>32
-// CHECK: s_addc_u32 vcc_hi, vcc_hi, -1 // 000000000030: 826BFF6B FFFFFFFF
+// CHECK: s_addc_u32 vcc_hi, vcc_hi, lit(0xffffffff) // 000000000030: 826BFF6B FFFFFFFF
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
index d2da087a44743..856d7c22177ff 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8-literal16.txt
@@ -40,8 +40,7 @@
 # VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00]
 0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01
 
-# FIXME: This should be able to round trip with literal after instruction
-# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
+# VI: v_add_f16_e32 v1, lit(0x0), v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x00,0x00,0x00]
 0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
 
 # VI: v_add_f16_e32 v1, 0xffcd, v3 ; encoding: [0xff,0x06,0x02,0x3e,0xcd,0xff,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/literals.txt b/llvm/test/MC/Disassembler/AMDGPU/literals.txt
new file mode 100644
index 0000000000000..bd013a15fa23c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/literals.txt
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
+
+0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_bf16_e32 v127.l, lit(0x1)        ; encoding: [0xff,0x94,0xfe,0x7e,0x01,0x00,0x00,0x00]
+
+0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_add_bf16 v255, lit(0x1), vcc_hi op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[1,1] neg_hi:[1,1] clamp ; encoding: [0xff,0xd3,0x23,0xcc,0xff,0xd6,0x00,0x68,0x01,0x00,0x00,0x00]
+
+0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_f16_e32 v127.l, lit(0x1)         ; encoding: [0xff,0x3e,0xfe,0x7e,0x01,0x00,0x00,0x00]
+
+0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_fmac_f16 v255, lit(0x1), v255      ; encoding: [0xff,0xfe,0xff,0x79,0x01,0x00,0x00,0x00]
+
+# The immediate is always literal in this instruction.
+0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00
+# GFX1250: v_cvt_pk_bf8_f16 v1.l, 1                ; encoding: [0x01,0x00,0x73,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
+
+0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00
+# GFX1250: v_cvt_pk_f16_bf8 v1, lit(0x1)           ; encoding: [0xff,0xec,0x02,0x7e,0x01,0x00,0x00,0x00]
+
+0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00
+# GFX1250: v_pk_add_min_i16 v10, lit(0x1), v2, v3  ; encoding: [0x0a,0x40,0x2d,0xcc,0xff,0x04,0x0e,0x1c,0x01,0x00,0x00,0x00]
+
+0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00
+# GFX1250: v_tanh_f32_e32 v255, lit(0x1)           ; encoding: [0xff,0x3c,0xfe,0x7f,0x01,0x00,0x00,0x00]
+
+0xff,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00
+# GFX1250: v_mov_b64_e32 v[254:255], lit(0x1)      ; encoding: [0xfe,0x3a,0xfc,0x7f,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
index 7cc4446f1038b..ad45d1e3e3e4f 100644
--- a/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
+++ b/llvm/test/Transforms/InstCombine/cast-set-preserve-signed-dbg-val.ll
@@ -11,10 +11,8 @@ define i16 @test5(i16 %A) !dbg !34 {
   call void @llvm.dbg.value(metadata i32 %C, metadata !37, metadata !DIExpression()), !dbg !41
 
   ; Preserve the dbg.value for the DCE'd 32-bit 'and'.
-  ;
-  ; The high 16 bits of the original 'and' require sign-extending the new 16-bit and:
   ; CHECK-NEXT: #dbg_value(i16 [[and]], [[C:![0-9]+]],
-  ; CHECK-SAME:    !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_stack_value)
+  ; CHECK-SAME:    !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_stack_value)
 
   %D = trunc i32 %C to i16, !dbg !42
   call void @llvm.dbg.value(metadata i16 %D, metadata !38, metadata !DIExpression()), !dbg !42
diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
index 69b8f6953d61e..82ecbd41e50a7 100644
--- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll
@@ -86,3 +86,14 @@ define <4 x ptr> @test7(<4 x i128> %arg) nounwind {
   %p1 = inttoptr <4 x i128> %arg to <4 x ptr>
   ret <4 x ptr> %p1
 }
+
+define i64 @ptrtoint_gep_sub(ptr %ptr, i64 %end.addr) {
+; CHECK-LABEL: @ptrtoint_gep_sub(
+; CHECK-NEXT:    ret i64 [[END_ADDR:%.*]]
+;
+  %ptr.addr = ptrtoint ptr %ptr to i64
+  %size = sub i64 %end.addr, %ptr.addr
+  %end = getelementptr i8, ptr %ptr, i64 %size
+  %end.addr2 = ptrtoint ptr %end to i64
+  ret i64 %end.addr2
+}
diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 279d4e82ccb85..83623fd82bb4a 100644
--- a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;CHECK-LABEL: @foo(
 ;CHECK: icmp eq <4 x i32>
 ;CHECK: select <4 x i1>
-;CHECK: ret i32
-define i32 @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
+;CHECK: ret void
+define void @foo(i32 %x, i32 %t, ptr nocapture %A) nounwind uwtable ssp {
 entry:
   %cmp10 = icmp sgt i32 %x, 0
   br i1 %cmp10, label %for.body, label %for.end
@@ -35,5 +35,5 @@ if.end:                                           ; preds = %for.body, %if.then
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %if.end, %entry
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
index 596e42e9f094d..d0c11946c9deb 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll
@@ -36,7 +36,7 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %exit, label %for.body
 }
 
-define i32 @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
+define void @if_convert(ptr %a, ptr %b, i32 %start, i32 %end) #0 {
 
 ; CHECK-COST-2: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
 ; CHECK-COST-2-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.032
@@ -70,7 +70,7 @@ for.cond.cleanup.loopexit:                        ; preds = %if.end
   br label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret i32 undef
+  ret void
 
 for.body:                                         ; preds = %for.body.preheader, %if.end
   %i.032 = phi i32 [ %inc, %if.end ], [ %start, %for.body.preheader ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
index 9e205863b8367..44fb8cb2f3beb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; CHECK-LABEL: @read_mod_write_single_ptr(
 ; CHECK: load <8 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -23,15 +23,15 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
 
 ; CHECK-LABEL: @read_mod_i64(
 ; SLOWMEM32: load <2 x i64>
 ; FASTMEM32: load <4 x i64>
-; CHECK: ret i32
-define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -47,6 +47,6 @@ define i32 @read_mod_i64(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 6d2cda48f90ca..0287645d9d7f9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[ITER_CHECK:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -37,7 +37,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
-; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
 ; CHECK:       vec.epilog.ph:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -58,7 +58,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4)
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
 ; CHECK-NEXT:    [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
 ; CHECK-NEXT:    br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -73,11 +73,11 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       ._crit_edge.loopexit:
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
   %1 = icmp sgt i32 %n, 3
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -93,10 +93,10 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
-define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
+define void @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
 ; CHECK-LABEL: @conversion_cost2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -136,7 +136,7 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -152,11 +152,11 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       ._crit_edge.loopexit:
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
   %1 = icmp sgt i32 %n, 9
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -173,5 +173,5 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index af5c921c29149..fa3b4a6609d4c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux"
 ;CHECK-LABEL: func1x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
-define i32 @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func1x6(ptr nocapture %out, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
 entry:
   br label %for.body
 
@@ -40,14 +40,14 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  ret i32 undef
+  ret void
 }
 
 ; We are vectorizing with 12 runtime checks.
 ;CHECK-LABEL: func2x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
-define i32 @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
+define void @func2x6(ptr nocapture %out, ptr nocapture %out2, ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E, ptr nocapture %F) {
 entry:
   br label %for.body
 
@@ -85,5 +85,5 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
index 8971dfe507240..47355e7d9dafd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK-NOUNRL: store <4 x i32>
 ;CHECK-NOUNRL-NOT: store <4 x i32>
 ;CHECK-NOUNRL: ret
-define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
+define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -27,5 +27,5 @@ define i32 @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index f64255f29d335..b7aa958958734 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; When scalarizing stores we need to preserve the original order.
 ; Make sure that we are extracting in the correct order (0101, and not 0011).
 
-define i32 @foo(ptr nocapture %A) {
+define void @foo(ptr nocapture %A) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
@@ -39,7 +39,7 @@ define i32 @foo(ptr nocapture %A) {
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
@@ -55,7 +55,7 @@ for.body:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 1588d02eff3db..51255b2e35707 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -73,7 +73,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp26 = icmp sgt i32 %n, 0
@@ -106,11 +106,11 @@ if.end14:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 ; As above but with multiple variables set per block.
-define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
+define void @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK-LABEL: @multi_variable_if_nest(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -188,7 +188,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) {
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp26 = icmp sgt i32 %n, 0
@@ -224,5 +224,5 @@ if.end14:
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
index 8a7f4a386fda1..a88a9b1466149 100644
--- a/llvm/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -17,8 +17,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;  }
 ;}
 
-define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
-; CHECK-LABEL: define i32 @function0(
+define void @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+; CHECK-LABEL: define void @function0(
 ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[START:%.*]], i32 [[END:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP16:%.*]] = icmp slt i32 [[START]], [[END]]
@@ -94,7 +94,7 @@ define i32 @function0(ptr nocapture %a, ptr nocapture %b, i32 %start, i32 %end)
 ; CHECK:       [[FOR_END_LOOPEXIT]]:
 ; CHECK-NEXT:    br label %[[FOR_END]]
 ; CHECK:       [[FOR_END]]:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp16 = icmp slt i32 %start, %end
@@ -127,7 +127,7 @@ if.end:
   br i1 %cmp, label %for.body, label %for.end
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 
@@ -237,6 +237,8 @@ for.end:                                          ; preds = %for.inc, %entry
 ; Handle PHI with single incoming value having a full mask.
 ; PR34523
 
+; NOTE: Changing the PHI inputs from undef to poison changes the behaviour
+; of this test, so they are left as undef for now.
 define void @PR34523() {
 ; CHECK-LABEL: define void @PR34523() {
 ; CHECK-NEXT:  [[BB1:.*:]]
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 742ee649db3a2..eea22374ade30 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -337,7 +337,7 @@ for.end:                                          ; preds = %for.body
 ;    }
 ;  }
 
-define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -429,7 +429,7 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly
 ; CHECK:       for.end10.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END10]]
 ; CHECK:       for.end10:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
@@ -469,12 +469,12 @@ for.inc8:                                         ; preds = %for.body3, %for.con
   br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
 
 for.end10:                                        ; preds = %for.inc8, %entry
-  ret i32 undef
+  ret void
 }
 
 ; second uniform store to the same address is conditional.
 ; we do not vectorize this.
-define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
+define void @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocapture readonly %var2, i32 %itr) #0 {
 ; CHECK-LABEL: @multiple_uniform_stores_conditional(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
@@ -520,7 +520,7 @@ define i32 @multiple_uniform_stores_conditional(ptr nocapture %var1, ptr nocaptu
 ; CHECK:       for.end10.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END10]]
 ; CHECK:       for.end10:
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
@@ -567,7 +567,7 @@ for.inc8:                                         ; preds = %for.body3, %for.con
   br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
 
 for.end10:                                        ; preds = %for.inc8, %entry
-  ret i32 undef
+  ret void
 }
 
 ; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
index b891b4312f18d..d9d9eec9b7d33 100644
--- a/llvm/test/Transforms/LoopVectorize/memdep.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -132,7 +132,7 @@ for.end:
 ; CHECK-LABEL: @f6
 ; CHECK-NOT: <2 x i32>
 
-define i32 @f6(ptr %a, i32 %tmp) {
+define void @f6(ptr %a, i32 %tmp) {
 entry:
   br label %for.body
 
@@ -149,7 +149,7 @@ for.body:
   br i1 %exitcond, label %for.body, label %for.end
 
 for.end:
-  ret i32 undef
+  ret void
 }
 
 ; Don't vectorize true loop carried dependencies that are not a multiple of the
diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
index d700d484c56bc..f5e480c910c33 100644
--- a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
+++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -10,7 +10,7 @@
 ; CHECK: store i64 %indvars.outer, ptr %O2, align 4
 
 
-define i64 @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
+define void @foo(ptr nocapture %A, ptr nocapture %B, i64 %n, i64 %m, ptr %O1, ptr %O2) {
 entry:
   %cmp = icmp sgt i64 %n, 0
   br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
@@ -50,5 +50,5 @@ for.end.outer.loopexit:                           ; preds = %for.end.inner
   br label %for.end.outer
 
 for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
-  ret i64 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll
index ad7f6e7b16b16..0a9c8c1504055 100644
--- a/llvm/test/Transforms/LoopVectorize/pr28541.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll
@@ -28,7 +28,7 @@
 ; CHECK-NOT: vectorized loop
 ; CHECK-LABEL: fn1
 
-define i32 @fn1() {
+define void @fn1() {
 entry:
   %tmp2 = load i32, ptr @b, align 4
   %dec3 = add nsw i32 %tmp2, -1
@@ -67,5 +67,5 @@ while.cond.while.end_crit_edge:                   ; preds = %while.cond
   br label %while.end
 
 while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index f87be5a115044..6ea227f7492ee 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;     a[i] = b[i] * 3;
 ; }
 
-define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
+define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64, !dbg [[DBG4:![0-9]+]]
@@ -58,7 +58,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]], !dbg [[DBG14:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 undef, !dbg [[DBG14]]
+; CHECK-NEXT:    ret void, !dbg [[DBG14]]
 ;
 ; FORCED_OPTSIZE-LABEL: @foo(
 ; FORCED_OPTSIZE-NEXT:  entry:
@@ -80,7 +80,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp
 ; FORCED_OPTSIZE:       for.end.loopexit:
 ; FORCED_OPTSIZE-NEXT:    br label [[FOR_END]], !dbg [[DBG10:![0-9]+]]
 ; FORCED_OPTSIZE:       for.end:
-; FORCED_OPTSIZE-NEXT:    ret i32 undef, !dbg [[DBG10]]
+; FORCED_OPTSIZE-NEXT:    ret void, !dbg [[DBG10]]
 ;
 entry:
   %cmp6 = icmp sgt i32 %n, 0, !dbg !6
@@ -99,7 +99,7 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond, label %for.end, label %for.body, !dbg !7
 
 for.end:                                          ; preds = %for.body, %entry
-  ret i32 undef, !dbg !8
+  ret void, !dbg !8
 }
 
 ; Make sure that we try to vectorize loops with a runtime check if the
@@ -505,11 +505,11 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]]
+; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META36:![0-9]+]], !noalias [[META39:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4, !alias.scope [[META39]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -524,7 +524,7 @@ define void @test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    store i32 0, ptr [[IN]], align 4
 ; CHECK-NEXT:    [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]]
-; CHECK-NEXT:    br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll
index cc21b94bc3070..8df71e8394874 100644
--- a/llvm/test/Transforms/LoopVectorize/write-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/write-only.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ;CHECK-LABEL: @read_mod_write_single_ptr(
 ;CHECK: load <4 x float>
-;CHECK: ret i32
-define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+;CHECK: ret void
+define void @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -21,14 +21,14 @@ define i32 @read_mod_write_single_ptr(ptr nocapture %a, i32 %n) nounwind uwtable
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
 
 ; Ensure that volatile stores are not vectorized.
 ; CHECK-LABEL: @read_mod_write_single_ptr_volatile_store(
 ; CHECK-NOT: store <4 x float>
-; CHECK: ret i32
-define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
+; CHECK: ret void
+define void @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
@@ -44,5 +44,5 @@ define i32 @read_mod_write_single_ptr_volatile_store(ptr nocapture %a, i32 %n) n
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret i32 undef
+  ret void
 }
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
index f1ffcc788a019..239397bb652ca 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll
@@ -17,7 +17,7 @@
 
 define void @_Z11hotFunctionbiiPiS_S_(i1 %cond, i32 %M, i32 %N, ptr %A, ptr %B, ptr %C) !prof !36 {
 ; CHECK-LABEL: define void @_Z11hotFunctionbiiPiS_S_
-; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) !prof [[PROF16:![0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], i32 [[M:%.*]], i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) {{.*}}{
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP19_NOT:%.*]] = icmp eq i32 [[M]], 0
 ; CHECK-NEXT:    br i1 [[CMP19_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], !prof [[PROF17:![0-9]+]]
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index b7f898ff23eba..79216e89c7cba 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ExecutionEngine/Orc/SectCreate.h"
 #include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/ExecutionEngine/Orc/SimpleRemoteMemoryMapper.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h"
@@ -312,10 +313,19 @@ static cl::opt<bool>
                                cl::desc("Show FailedToMaterialize errors"),
                                cl::init(false), cl::cat(JITLinkCategory));
 
-static cl::opt<bool> UseSharedMemory(
-    "use-shared-memory",
-    cl::desc("Use shared memory to transfer generated code and data"),
-    cl::init(false), cl::cat(JITLinkCategory));
+enum class MemMgr { Default, Generic, SimpleRemote, Shared }; // -use-memmgr
+
+static cl::opt<MemMgr> UseMemMgr(
+    "use-memmgr", cl::desc("Choose memory manager"), cl::init(MemMgr::Generic),
+    cl::values(clEnumValN(MemMgr::Default, "default",
+                          "Use setup default (InProcess or EPCGeneric)"),
+               clEnumValN(MemMgr::Generic, "generic",
+                          "Generic remote memory manager"),
+               clEnumValN(MemMgr::SimpleRemote, "simple-remote",
+                          "Mapper memory manager with simple-remote backend"),
+               clEnumValN(MemMgr::Shared, "shared",
+                          "Mapper memory manager with shared-memory manager")),
+    cl::cat(JITLinkCategory)); // initial value is MemMgr::Generic (cl::init)
 
 static cl::opt<std::string>
     OverrideTriple("triple", cl::desc("Override target triple detection"),
@@ -717,6 +727,27 @@ static std::unique_ptr<JITLinkMemoryManager> createInProcessMemoryManager() {
           SlabSize));
 }
 
+Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
+createSimpleRemoteMemoryManager(SimpleRemoteEPC &SREPC) {
+  SimpleRemoteMemoryMapper::SymbolAddrs SAs; // filled in by the lookup below
+  if (auto Err = SREPC.getBootstrapSymbols(
+          {{SAs.Instance, rt::SimpleExecutorMemoryManagerInstanceName},
+           {SAs.Reserve, rt::SimpleExecutorMemoryManagerReserveWrapperName},
+           {SAs.Initialize,
+            rt::SimpleExecutorMemoryManagerInitializeWrapperName},
+           {SAs.Deinitialize,
+            rt::SimpleExecutorMemoryManagerDeinitializeWrapperName},
+           {SAs.Release, rt::SimpleExecutorMemoryManagerReleaseWrapperName}}))
+    return std::move(Err); // bootstrap symbol lookup failed; hand error back
+#ifdef _WIN32
+  size_t SlabSize = 1024 * 1024; // 2^20; 1 MiB assuming bytes
+#else
+  size_t SlabSize = 1024 * 1024 * 1024; // 2^30; 1 GiB assuming bytes
+#endif
+  return MapperJITLinkMemoryManager::CreateWithMapper<SimpleRemoteMemoryMapper>(
+      SlabSize, SREPC, SAs); // NOTE(review): slab size is hard-coded here
+}
+
 Expected<std::unique_ptr<jitlink::JITLinkMemoryManager>>
 createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
   SharedMemoryMapper::SymbolAddrs SAs;
@@ -745,6 +776,19 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) {
       SlabSize, SREPC, SAs);
 }
 
+static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) {
+  switch (UseMemMgr) { // value chosen with -use-memmgr
+  case MemMgr::Default:
+  case MemMgr::Generic:
+    break; // both leave S.CreateMemoryManager as the caller set it up
+  case MemMgr::SimpleRemote:
+    S.CreateMemoryManager = createSimpleRemoteMemoryManager;
+    break;
+  case MemMgr::Shared:
+    S.CreateMemoryManager = createSharedMemoryManager;
+    break;
+  }
+}
 
 static Expected<MaterializationUnit::Interface>
 getTestObjectFileInterface(Session &S, MemoryBufferRef O) {
@@ -904,8 +948,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> launchExecutor() {
   close(FromExecutor[WriteEnd]);
 
   auto S = SimpleRemoteEPC::Setup();
-  if (UseSharedMemory)
-    S.CreateMemoryManager = createSharedMemoryManager;
+  setupEPCRemoteMemoryManager(S);
 
   return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
       std::make_unique<DynamicThreadPoolTaskDispatcher>(MaterializationThreads),
@@ -994,8 +1037,7 @@ static Expected<std::unique_ptr<ExecutorProcessControl>> connectToExecutor() {
     return SockFD.takeError();
 
   auto S = SimpleRemoteEPC::Setup();
-  if (UseSharedMemory)
-    S.CreateMemoryManager = createSharedMemoryManager;
+  setupEPCRemoteMemoryManager(S);
 
   return SimpleRemoteEPC::Create<FDSimpleRemoteEPCTransport>(
       std::make_unique<DynamicThreadPoolTaskDispatcher>(std::nullopt),
diff --git a/llvm/utils/Misc/zkill b/llvm/utils/Misc/zkill
index bc0bfd586f7a4..8e10144ed4ac9 100755
--- a/llvm/utils/Misc/zkill
+++ b/llvm/utils/Misc/zkill
@@ -14,7 +14,7 @@ def _write_message(kind, message):
     file,line,_,_,_ = inspect.getframeinfo(f)
     location = '%s:%d' % (os.path.basename(file), line)
 
-    print >>sys.stderr, '%s: %s: %s' % (location, kind, message)
+    print('%s: %s: %s' % (location, kind, message), file=sys.stderr)
 
 note = lambda message: _write_message('note', message)
 warning = lambda message: _write_message('warning', message)
@@ -53,7 +53,7 @@ def extractExecutable(command):
 
 class Struct:
     def __init__(self, **kwargs):
-        self.fields = kwargs.keys()
+        self.fields = list(kwargs.keys())
         self.__dict__.update(kwargs)
 
     def __repr__(self):
@@ -144,7 +144,7 @@ def main():
     parser.add_option("-s", "", dest="signalName",
                       help="Name of the signal to use (default=%default)",
                       action="store", default='INT',
-                      choices=kSignals.keys())
+                      choices=list(kSignals.keys()))
     parser.add_option("-l", "", dest="listSignals",
                       help="List known signal names",
                       action="store_true", default=False)
@@ -202,18 +202,18 @@ def main():
     (opts, args) = parser.parse_args()
 
     if opts.listSignals:
-        items = [(v,k) for k,v in kSignals.items()]
+        items = [(v,k) for k,v in list(kSignals.items())]
         items.sort()
         for i in range(0, len(items), 4):
-            print '\t'.join(['%2d) SIG%s' % (k,v)
-                             for k,v in items[i:i+4]])
+            print('\t'.join(['%2d) SIG%s' % (k,v)
+                             for k,v in items[i:i+4]]))
         sys.exit(0)
 
     # Figure out the signal to use.
     signal = kSignals[opts.signalName]
     signalValueName = str(signal)
     if opts.verbose:
-        name = dict((v,k) for k,v in kSignals.items()).get(signal,None)
+        name = dict((v,k) for k,v in list(kSignals.items())).get(signal,None)
         if name:
             signalValueName = name
             note('using signal %d (SIG%s)' % (signal, name))
diff --git a/llvm/utils/clang-parse-diagnostics-file b/llvm/utils/clang-parse-diagnostics-file
index 1f720c34544a7..fac5866d364a8 100755
--- a/llvm/utils/clang-parse-diagnostics-file
+++ b/llvm/utils/clang-parse-diagnostics-file
@@ -87,14 +87,14 @@ Utility for dumping Clang-style logged diagnostics.\
         return
 
     # Otherwise, print out the diagnostics.
-    print
-    print "**** BUILD DIAGNOSTICS ****"
+    print()
+    print("**** BUILD DIAGNOSTICS ****")
     for file,selected_diags in to_report:
-        print "*** %s ***" % file
+        print("*** %s ***" % file)
         for d in selected_diags:
-            print " %s:%s:%s: %s: %s" % (
+            print(" %s:%s:%s: %s: %s" % (
                 d.get('filename'), d.get('line'), d.get('column'),
-                d.get('level'), d.get('message'))
+                d.get('level'), d.get('message')))
 
 if __name__ == "__main__":
     main()
diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
index 9b69a4435a562..84384217897c4 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn
@@ -68,6 +68,7 @@ static_library("Orc") {
     "SectCreate.cpp",
     "SelfExecutorProcessControl.cpp",
     "SimpleRemoteEPC.cpp",
+    "SimpleRemoteMemoryMapper.cpp",
     "SpeculateAnalyses.cpp",
     "Speculation.cpp",
     "TaskDispatch.cpp",
diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index 9639aa0dc44b4..4afb41d4060f0 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -21,11 +21,7 @@
 
 import sys
 import re
-
-try:
-    from urllib.request import urlopen
-except ImportError:
-    from urllib2 import urlopen
+from urllib.request import urlopen
 
 
 # This variable will body of the mappings function
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index b67e4cb435e55..d0811a282c816 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1986,6 +1986,7 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [
     OptionalAttr<StrAttr>:$instrument_function_exit,
     OptionalAttr<UnitAttr>:$no_inline,
     OptionalAttr<UnitAttr>:$always_inline,
+    OptionalAttr<UnitAttr>:$inline_hint,
     OptionalAttr<UnitAttr>:$no_unwind,
     OptionalAttr<UnitAttr>:$will_return,
     OptionalAttr<UnitAttr>:$optimize_none,
@@ -2038,6 +2039,9 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [
     /// Returns true if the `always_inline` attribute is set, false otherwise.
     bool isAlwaysInline() { return bool(getAlwaysInlineAttr()); }
 
+    /// Returns true if the `inline_hint` attribute is set, false otherwise.
+    bool isInlineHint() { return bool(getInlineHintAttr()); }
+
     /// Returns true if the `optimize_none` attribute is set, false otherwise.
     bool isOptimizeNone() { return bool(getOptimizeNoneAttr()); }
   }];
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h b/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
index 10491f65d37af..4ecf03c34c1a5 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TargetEnv.h
@@ -50,28 +50,63 @@ TargetEnvAttr getDefaultTargetEnv(MLIRContext *context);
 /// returned by getDefaultTargetEnv() if not provided.
 TargetEnvAttr lookupTargetEnvOrDefault(Operation *op);
 
+/// Represents a TOSA specification version as a (major, minor) pair that
+/// can be built from a SpecificationVersion and compared with another one.
+class TosaSpecificationVersion {
+public:
+  TosaSpecificationVersion(uint32_t major, uint32_t minor)
+      : majorVersion(major), minorVersion(minor) {}
+  TosaSpecificationVersion(SpecificationVersion version)
+      : TosaSpecificationVersion(fromVersionEnum(version)) {}
+
+  /// True when the major numbers match and this minor is >= the base minor.
+  bool isBackwardsCompatibleWith(TosaSpecificationVersion baseVersion) const {
+    const bool sameMajor = majorVersion == baseVersion.majorVersion;
+    return sameMajor && minorVersion >= baseVersion.minorVersion;
+  }
+
+  uint32_t getMajor() const { return majorVersion; }
+  uint32_t getMinor() const { return minorVersion; }
+
+private:
+  uint32_t majorVersion = 0;
+  uint32_t minorVersion = 0;
+
+  static TosaSpecificationVersion fromVersionEnum(SpecificationVersion v) {
+    switch (v) {
+    case SpecificationVersion::V_1_0:
+      return {1, 0};
+    case SpecificationVersion::V_1_1_DRAFT:
+      return {1, 1};
+    }
+    llvm_unreachable("Unknown TOSA version");
+  }
+};
+
+llvm::SmallString<4> stringifyVersion(TosaSpecificationVersion version);
+
 /// This class represents the capability enabled in the target implementation
 /// such as profile, extension, and level. It's a wrapper class around
 /// tosa::TargetEnvAttr.
 class TargetEnv {
 public:
   TargetEnv() {}
-  explicit TargetEnv(Level level, const ArrayRef<Profile> &profiles,
+  explicit TargetEnv(SpecificationVersion specificationVersion, Level level,
+                     const ArrayRef<Profile> &profiles,
                      const ArrayRef<Extension> &extensions)
-      : level(level) {
+      : specificationVersion(specificationVersion), level(level) {
     enabledProfiles.insert_range(profiles);
     enabledExtensions.insert_range(extensions);
   }
 
   explicit TargetEnv(TargetEnvAttr targetAttr)
-      : TargetEnv(targetAttr.getLevel(), targetAttr.getProfiles(),
-                  targetAttr.getExtensions()) {}
+      : TargetEnv(targetAttr.getSpecificationVersion(), targetAttr.getLevel(),
+                  targetAttr.getProfiles(), targetAttr.getExtensions()) {}
 
   void addProfile(Profile p) { enabledProfiles.insert(p); }
   void addExtension(Extension e) { enabledExtensions.insert(e); }
 
-  // TODO implement the following utilities.
-  // Version getSpecVersion() const;
+  SpecificationVersion getSpecVersion() const { return specificationVersion; }
 
   TosaLevel getLevel() const {
     if (level == Level::eightK)
@@ -105,6 +140,7 @@ class TargetEnv {
   }
 
 private:
+  SpecificationVersion specificationVersion;
   Level level;
   llvm::SmallSet<Profile, 3> enabledProfiles;
   llvm::SmallSet<Extension, 13> enabledExtensions;
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
index 1f718accabd15..c1b5e785bd739 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaComplianceData.h.inc
@@ -2,441 +2,779 @@
 // `tools/genspec.py` in https://git.mlplatform.org/tosa/specification.git
 profileComplianceMap = {
     {"tosa.argmax",
-     {{{Profile::pro_int}, {{i8T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, i32T}, {fp32T, i32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, i32T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.avg_pool2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i8T, i32T, i8T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i8T, i32T, i8T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp32T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.conv3d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.depthwise_conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.matmul",
-     {{{Profile::pro_int}, {{i8T, i8T, i8T, i8T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i8T, i8T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp32T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.max_pool2d",
-     {{{Profile::pro_int}, {{i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose_conv2d",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T, i8T, i8T, i32T, i32T}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T, i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
-        {fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
-        {fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T, fp16T, fp16T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp16T, fp16T, fp16T, fp16T, fp16T, fp32T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T, fp32T, fp32T, fp32T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.clamp",
-     {{{Profile::pro_int}, {{i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.erf", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.sigmoid", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.tanh", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.erf",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sigmoid",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.tanh",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.add",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.arithmetic_right_shift",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_and",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_or",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_xor",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.intdiv",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_and",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_left_shift",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf}}},
     {"tosa.logical_right_shift",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf}}},
     {"tosa.logical_or",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.logical_xor",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.maximum",
-     {{{Profile::pro_int}, {{i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.minimum",
-     {{{Profile::pro_int}, {{i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.mul",
-     {{{Profile::pro_int}, {{i8T, i8T, i32T}, {i16T, i16T, i32T}}},
-      {{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pow",
-     {{{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.sub",
-     {{{Profile::pro_int, Profile::pro_fp}, {{i32T, i32T, i32T}}, anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
-    {"tosa.table", {{{Profile::pro_int}, {{i8T, i8T, i8T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{i32T, i32T, i32T}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.table",
+     {{{Profile::pro_int}, {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.abs",
-     {{{Profile::pro_int}, {{i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.bitwise_not",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}}}},
-    {"tosa.ceil", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.clz", {{{Profile::pro_int}, {{i32T, i32T}}}}},
-    {"tosa.cos", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.exp", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.floor", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.log", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.ceil",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.clz",
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.cos",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.exp",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.floor",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.log",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.logical_not",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.negate",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T, i8T},
-        {i16T, i16T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}},
+       {{{i8T, i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{fp16T, fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reciprocal",
-     {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.rsqrt", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
-    {"tosa.sin", {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rsqrt",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sin",
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.select",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
       {{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.equal",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.greater",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.greater_equal",
-     {{{Profile::pro_int}, {{i32T, i32T, boolT}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, boolT}, {fp32T, fp32T, boolT}}}}},
+     {{{Profile::pro_int},
+       {{{i32T, i32T, boolT}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, boolT}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, boolT}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_all",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.reduce_any",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf}}},
     {"tosa.reduce_max",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_min",
-     {{{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_product",
-     {{{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reduce_sum",
-     {{{Profile::pro_int}, {{i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int}, {{{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.concat",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pad",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT, boolT}}, anyOf},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
       {{Profile::pro_int},
-       {{i8T, i8T, i8T}, {i16T, i16T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T, fp16T}, {fp32T, fp32T, fp32T}}}}},
+       {{{i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reshape",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reverse",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.slice",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.tile",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose",
-     {{{Profile::pro_int, Profile::pro_fp}, {{boolT, boolT}}, anyOf},
-      {{Profile::pro_int}, {{i8T, i8T}, {i16T, i16T}, {i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int, Profile::pro_fp},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0}},
+       anyOf},
+      {{Profile::pro_int},
+       {{{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.gather",
      {{{Profile::pro_int},
-       {{i8T, i32T, i8T}, {i16T, i32T, i16T}, {i32T, i32T, i32T}}},
-      {{Profile::pro_fp}, {{fp16T, i32T, fp16T}, {fp32T, i32T, fp32T}}}}},
+       {{{i8T, i32T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, i32T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.scatter",
      {{{Profile::pro_int},
-       {{i8T, i32T, i8T, i8T},
-        {i16T, i32T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}},
+       {{{i8T, i32T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}},
       {{Profile::pro_fp},
-       {{fp16T, i32T, fp16T, fp16T}, {fp32T, i32T, fp32T, fp32T}}}}},
+       {{{fp16T, i32T, fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.resize",
-     {{{Profile::pro_int}, {{i8T, i32T}, {i8T, i8T}}},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+     {{{Profile::pro_int},
+       {{{i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.cast",
      {{{Profile::pro_int},
-       {{boolT, i8T},
-        {boolT, i16T},
-        {boolT, i32T},
-        {i8T, boolT},
-        {i8T, i16T},
-        {i8T, i32T},
-        {i16T, boolT},
-        {i16T, i8T},
-        {i16T, i32T},
-        {i32T, boolT},
-        {i32T, i8T},
-        {i32T, i16T}}},
-      {{Profile::pro_fp},
-       {{i8T, fp16T},
-        {i8T, fp32T},
-        {i16T, fp16T},
-        {i16T, fp32T},
-        {i32T, fp16T},
-        {i32T, fp32T},
-        {fp16T, i8T},
-        {fp16T, i16T},
-        {fp16T, i32T},
-        {fp16T, fp32T},
-        {fp32T, i8T},
-        {fp32T, i16T},
-        {fp32T, i32T},
-        {fp32T, fp16T}}}}},
+       {{{boolT, i8T}, SpecificationVersion::V_1_0},
+        {{boolT, i16T}, SpecificationVersion::V_1_0},
+        {{boolT, i32T}, SpecificationVersion::V_1_0},
+        {{i8T, boolT}, SpecificationVersion::V_1_0},
+        {{i8T, i16T}, SpecificationVersion::V_1_0},
+        {{i8T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, boolT}, SpecificationVersion::V_1_0},
+        {{i16T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i32T}, SpecificationVersion::V_1_0},
+        {{i32T, boolT}, SpecificationVersion::V_1_0},
+        {{i32T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{i8T, fp16T}, SpecificationVersion::V_1_0},
+        {{i8T, fp32T}, SpecificationVersion::V_1_0},
+        {{i16T, fp16T}, SpecificationVersion::V_1_0},
+        {{i16T, fp32T}, SpecificationVersion::V_1_0},
+        {{i32T, fp16T}, SpecificationVersion::V_1_0},
+        {{i32T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, i8T}, SpecificationVersion::V_1_0},
+        {{fp16T, i16T}, SpecificationVersion::V_1_0},
+        {{fp16T, i32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, i8T}, SpecificationVersion::V_1_0},
+        {{fp32T, i16T}, SpecificationVersion::V_1_0},
+        {{fp32T, i32T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.rescale",
      {{{Profile::pro_int},
-       {{i8T, i8T, i8T, i8T},
-        {i8T, i8T, i16T, i16T},
-        {i8T, i8T, i32T, i32T},
-        {i16T, i16T, i8T, i8T},
-        {i16T, i16T, i16T, i16T},
-        {i16T, i16T, i32T, i32T},
-        {i32T, i32T, i8T, i8T},
-        {i32T, i32T, i16T, i16T},
-        {i32T, i32T, i32T, i32T}}}}},
+       {{{i8T, i8T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i8T, i8T, i32T, i32T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T, i32T, i32T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.const",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{boolT}, {i8T}, {i16T}, {i32T}},
+       {{{boolT}, SpecificationVersion::V_1_0},
+        {{i8T}, SpecificationVersion::V_1_0},
+        {{i16T}, SpecificationVersion::V_1_0},
+        {{i32T}, SpecificationVersion::V_1_0}},
        anyOf},
-      {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.identity",
      {{{Profile::pro_int, Profile::pro_fp},
-       {{boolT, boolT}, {i8T, i8T}, {i16T, i16T}, {i32T, i32T}},
+       {{{boolT, boolT}, SpecificationVersion::V_1_0},
+        {{i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i32T, i32T}, SpecificationVersion::V_1_0}},
        anyOf},
-      {{Profile::pro_fp}, {{fp16T, fp16T}, {fp32T, fp32T}}}}},
+      {{Profile::pro_fp},
+       {{{fp16T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_write",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_read",
-     {{{Profile::pro_int}, {{i8T}}}, {{Profile::pro_fp}, {{fp16T}, {fp32T}}}}},
+     {{{Profile::pro_int}, {{{i8T}, SpecificationVersion::V_1_0}}},
+      {{Profile::pro_fp},
+       {{{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
 };
 
 extensionComplianceMap = {
     {"tosa.argmax",
-     {{{Extension::int16}, {{i16T, i32T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, i32T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T}}},
-      {{Extension::bf16}, {{bf16T, i32T}}}}},
+     {{{Extension::int16}, {{{i16T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3}, {{{fp8e4m3T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2}, {{{fp8e5m2T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.avg_pool2d",
-     {{{Extension::int16}, {{i16T, i16T, i16T, i32T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+     {{{Extension::int16},
+       {{{i16T, i16T, i16T, i32T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T},
+         SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T},
+         SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, fp32T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.conv3d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.depthwise_conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
-    {"tosa.fft2d", {{{Extension::fft}, {{fp32T, fp32T, fp32T, fp32T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
+    {"tosa.fft2d",
+     {{{Extension::fft},
+       {{{fp32T, fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.matmul",
-     {{{Extension::int16}, {{i16T, i16T, i16T, i16T, i48T}}},
+     {{{Extension::int16},
+       {{{i16T, i16T, i16T, i16T, i48T}, SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T},
-        {fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp32T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp8e4m3T, fp8e4m3T, fp8e4m3T, fp8e4m3T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T},
-        {fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp32T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp16T},
+         SpecificationVersion::V_1_0},
+        {{fp8e5m2T, fp8e5m2T, fp8e5m2T, fp8e5m2T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}}},
       {{Extension::fp8e4m3, Extension::fp8e5m2},
-       {{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp16T},
-        {fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp32T},
-        {fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp16T},
-        {fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp32T}},
+       {{{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp16T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e4m3T, fp8e5m2T, fp8e4m3T, fp8e5m2T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp16T},
+         SpecificationVersion::V_1_1_DRAFT},
+        {{fp8e5m2T, fp8e4m3T, fp8e5m2T, fp8e4m3T, fp32T},
+         SpecificationVersion::V_1_1_DRAFT}},
        allOf},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T, bf16T, fp32T}}}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, bf16T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.max_pool2d",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.rfft2d", {{{Extension::fft}, {{fp32T, fp32T, fp32T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rfft2d",
+     {{{Extension::fft},
+       {{{fp32T, fp32T, fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose_conv2d",
-     {{{Extension::int4}, {{i8T, i4T, i32T, i8T, i4T, i32T, i32T}}},
-      {{Extension::int16}, {{i16T, i8T, i48T, i16T, i8T, i48T, i48T}}},
+     {{{Extension::int4},
+       {{{i8T, i4T, i32T, i8T, i4T, i32T, i32T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16},
+       {{{i16T, i8T, i48T, i16T, i8T, i48T, i48T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T}}},
+       {{{fp8e4m3T, fp8e4m3T, fp16T, fp8e4m3T, fp8e4m3T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T}}},
+       {{{fp8e5m2T, fp8e5m2T, fp16T, fp8e5m2T, fp8e5m2T, fp16T, fp16T},
+         SpecificationVersion::V_1_0}}},
       {{Extension::bf16},
-       {{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T}}}}},
+       {{{bf16T, bf16T, bf16T, bf16T, bf16T, fp32T, bf16T},
+         SpecificationVersion::V_1_0}}}}},
     {"tosa.clamp",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.erf", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.sigmoid", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.tanh", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.add", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.maximum", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.minimum", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.mul", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.pow", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.sub", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.table", {{{Extension::int16}, {{i16T, i16T, i32T}}}}},
-    {"tosa.abs", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.ceil", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.cos", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.exp", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.floor", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.log", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.negate", {{{Extension::bf16}, {{bf16T, bf16T, bf16T, bf16T}}}}},
-    {"tosa.reciprocal", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.rsqrt", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.sin", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.select", {{{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
-    {"tosa.equal", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.greater", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.greater_equal", {{{Extension::bf16}, {{bf16T, bf16T, boolT}}}}},
-    {"tosa.reduce_max", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_min", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_product", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.reduce_sum", {{{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.erf",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sigmoid",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.tanh",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.add",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.maximum",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.minimum",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.mul",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.pow",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sub",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.table",
+     {{{Extension::int16},
+       {{{i16T, i16T, i32T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.abs",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.ceil",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.cos",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.exp",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.floor",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.log",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.negate",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reciprocal",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.rsqrt",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.sin",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.select",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.equal",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.greater",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.greater_equal",
+     {{{Extension::bf16},
+       {{{bf16T, bf16T, boolT}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_max",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_min",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_product",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.reduce_sum",
+     {{{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.concat",
-     {{{Extension::int16}, {{i16T, i16T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16}, {{{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.pad",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reshape",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.reverse",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.slice",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.tile",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.transpose",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.gather",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, i32T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, i32T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, i32T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, i32T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, i32T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.scatter",
-     {{{Extension::fp8e4m3}, {{fp8e4m3T, i32T, fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, i32T, fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, i32T, bf16T, bf16T}}}}},
+     {{{Extension::fp8e4m3},
+       {{{fp8e4m3T, i32T, fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, i32T, fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16},
+       {{{bf16T, i32T, bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.resize",
-     {{{Extension::int16}, {{i16T, i48T}, {i16T, i16T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
+     {{{Extension::int16},
+       {{{i16T, i48T}, SpecificationVersion::V_1_0},
+        {{i16T, i16T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.cast",
      {{{Extension::bf16},
-       {{i8T, bf16T},
-        {i16T, bf16T},
-        {i32T, bf16T},
-        {bf16T, i8T},
-        {bf16T, i16T},
-        {bf16T, i32T},
-        {bf16T, fp32T},
-        {fp32T, bf16T}}},
+       {{{i8T, bf16T}, SpecificationVersion::V_1_0},
+        {{i16T, bf16T}, SpecificationVersion::V_1_0},
+        {{i32T, bf16T}, SpecificationVersion::V_1_0},
+        {{bf16T, i8T}, SpecificationVersion::V_1_0},
+        {{bf16T, i16T}, SpecificationVersion::V_1_0},
+        {{bf16T, i32T}, SpecificationVersion::V_1_0},
+        {{bf16T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp32T, bf16T}, SpecificationVersion::V_1_0}}},
       {{Extension::bf16, Extension::fp8e4m3},
-       {{bf16T, fp8e4m3T}, {fp8e4m3T, bf16T}},
+       {{{bf16T, fp8e4m3T}, SpecificationVersion::V_1_0},
+        {{fp8e4m3T, bf16T}, SpecificationVersion::V_1_0}},
        allOf},
       {{Extension::bf16, Extension::fp8e5m2},
-       {{bf16T, fp8e5m2T}, {fp8e5m2T, bf16T}},
+       {{{bf16T, fp8e5m2T}, SpecificationVersion::V_1_0},
+        {{fp8e5m2T, bf16T}, SpecificationVersion::V_1_0}},
        allOf},
       {{Extension::fp8e4m3},
-       {{fp8e4m3T, fp16T},
-        {fp8e4m3T, fp32T},
-        {fp16T, fp8e4m3T},
-        {fp32T, fp8e4m3T}}},
+       {{{fp8e4m3T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp8e4m3T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp8e4m3T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
       {{Extension::fp8e5m2},
-       {{fp8e5m2T, fp16T},
-        {fp8e5m2T, fp32T},
-        {fp16T, fp8e5m2T},
-        {fp32T, fp8e5m2T}}}}},
+       {{{fp8e5m2T, fp16T}, SpecificationVersion::V_1_0},
+        {{fp8e5m2T, fp32T}, SpecificationVersion::V_1_0},
+        {{fp16T, fp8e5m2T}, SpecificationVersion::V_1_0},
+        {{fp32T, fp8e5m2T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.rescale",
      {{{Extension::int16},
-       {{i48T, i48T, i8T, i8T},
-        {i48T, i48T, i16T, i16T},
-        {i48T, i48T, i32T, i32T}}}}},
+       {{{i48T, i48T, i8T, i8T}, SpecificationVersion::V_1_0},
+        {{i48T, i48T, i16T, i16T}, SpecificationVersion::V_1_0},
+        {{i48T, i48T, i32T, i32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.const",
-     {{{Extension::int4}, {{i4T}}},
-      {{Extension::int16}, {{i48T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T}}}}},
+     {{{Extension::int4}, {{{i4T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16}, {{{i48T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3}, {{{fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2}, {{{fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.identity",
-     {{{Extension::int4}, {{i4T, i4T}}},
-      {{Extension::int16}, {{i48T, i48T}}},
-      {{Extension::fp8e4m3}, {{fp8e4m3T, fp8e4m3T}}},
-      {{Extension::fp8e5m2}, {{fp8e5m2T, fp8e5m2T}}},
-      {{Extension::bf16}, {{bf16T, bf16T}}}}},
-    {"tosa.variable", {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::int4}, {{{i4T, i4T}, SpecificationVersion::V_1_0}}},
+      {{Extension::int16}, {{{i48T, i48T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e4m3},
+       {{{fp8e4m3T, fp8e4m3T}, SpecificationVersion::V_1_0}}},
+      {{Extension::fp8e5m2},
+       {{{fp8e5m2T, fp8e5m2T}, SpecificationVersion::V_1_0}}},
+      {{Extension::bf16}, {{{bf16T, bf16T}, SpecificationVersion::V_1_0}}}}},
+    {"tosa.variable",
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_write",
-     {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
     {"tosa.variable_read",
-     {{{Extension::variable}, {{i8T}, {fp16T}, {fp32T}}}}},
+     {{{Extension::variable},
+       {{{i8T}, SpecificationVersion::V_1_0},
+        {{fp16T}, SpecificationVersion::V_1_0},
+        {{fp32T}, SpecificationVersion::V_1_0}}}}},
 };
+
 // End of auto-generated metadata
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 38cb2936ad8d9..8376a4c87dbf2 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -221,7 +221,7 @@ class Tosa_I32EnumAttr<string name, string description, string mnemonic,
 }
 
 //===----------------------------------------------------------------------===//
-// TOSA Spec Section 1.5.
+// TOSA Profiles and extensions
 //
 // Profile:
 // INT : Integer Inference. Integer operations, primarily 8 and 32-bit values.
@@ -293,12 +293,6 @@ def Tosa_ExtensionAttr
 def Tosa_ExtensionArrayAttr
     : TypedArrayAttrBase<Tosa_ExtensionAttr, "TOSA extension array attribute">;
 
-def Tosa_LVL_NONE : I32EnumAttrCase<"none", 0>;
-def Tosa_LVL_8K   : I32EnumAttrCase<"eightK", 1, "8k">;
-
-def Tosa_LevelAttr
-    : Tosa_I32EnumAttr<"Level", "supported TOSA levels", "level", [Tosa_LVL_NONE, Tosa_LVL_8K]>;
-
 // The base class for defining op availability dimensions.
 class Availability {
   // The following are fields for controlling the generated C++ OpInterface.
@@ -404,18 +398,41 @@ class Extension<list<I32EnumAttrCase> extensions> : Availability {
   let instance = "ref";
 }
 
+//===----------------------------------------------------------------------===//
+// TOSA Levels
+//===----------------------------------------------------------------------===//
+
+def Tosa_LVL_NONE : I32EnumAttrCase<"none", 0>;
+def Tosa_LVL_8K   : I32EnumAttrCase<"eightK", 1, "8k">;
+
+def Tosa_LevelAttr
+    : Tosa_I32EnumAttr<"Level", "supported TOSA levels", "level", [Tosa_LVL_NONE, Tosa_LVL_8K]>;
+
+//===----------------------------------------------------------------------===//
+// TOSA Specification versions
+//===----------------------------------------------------------------------===//
+
+def Tosa_V_1_0 : I32EnumAttrCase<"V_1_0", 0, "1.0">;
+def Tosa_V_1_1_DRAFT : I32EnumAttrCase<"V_1_1_DRAFT", 1, "1.1.draft">;
+
+def Tosa_SpecificationVersion : Tosa_I32EnumAttr<
+      "SpecificationVersion", "TOSA specification version", "specification_version",
+      [Tosa_V_1_0, Tosa_V_1_1_DRAFT]>;
+
 //===----------------------------------------------------------------------===//
 // TOSA target environment.
 //===----------------------------------------------------------------------===//
 def Tosa_TargetEnv : Tosa_Attr<"TargetEnv", "target_env"> {
   let summary = "Target environment information.";
   let parameters = ( ins
+    "SpecificationVersion": $specification_version,
     "Level": $level,
     ArrayRefParameter<"Profile">: $profiles,
     ArrayRefParameter<"Extension">: $extensions
   );
 
-  let assemblyFormat = "`<` `level` `=` $level `,` `profiles` `=` `[` $profiles `]` `,` "
+  let assemblyFormat = "`<` `specification_version` `=` $specification_version `,` "
+                       "`level` `=` $level `,` `profiles` `=` `[` $profiles `]` `,` "
                        "`extensions` `=` `[` $extensions `]` `>`";
 }
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
index 8f5c72bc5f7a9..7b946ad6c6a89 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaProfileCompliance.h
@@ -36,12 +36,15 @@ enum CheckCondition {
   allOf
 };
 
+using VersionedTypeInfo =
+    std::pair<SmallVector<TypeInfo>, SpecificationVersion>;
+
 template <typename T>
 struct OpComplianceInfo {
   // Certain operations require multiple modes enabled.
   // e.g. cast bf16 to fp8e4m3 requires EXT-BF16 and EXT-FP8E4M3.
   SmallVector<T> mode;
-  SmallVector<SmallVector<TypeInfo>> operandTypeInfoSet;
+  SmallVector<VersionedTypeInfo> operandTypeInfoSet;
   CheckCondition condition = CheckCondition::anyOf;
 };
 
@@ -130,9 +133,8 @@ class TosaProfileCompliance {
   // Find the required profiles or extensions from the compliance info according
   // to the operand type combination.
   template <typename T>
-  SmallVector<T> findMatchedProfile(Operation *op,
-                                    SmallVector<OpComplianceInfo<T>> compInfo,
-                                    CheckCondition &condition);
+  OpComplianceInfo<T>
+  findMatchedEntry(Operation *op, SmallVector<OpComplianceInfo<T>> compInfo);
 
   SmallVector<Profile> getCooperativeProfiles(Extension ext) {
     switch (ext) {
@@ -168,8 +170,7 @@ class TosaProfileCompliance {
 
 private:
   template <typename T>
-  FailureOr<SmallVector<T>> getOperatorDefinition(Operation *op,
-                                                  CheckCondition &condition);
+  FailureOr<OpComplianceInfo<T>> getOperatorDefinition(Operation *op);
 
   OperationProfileComplianceMap profileComplianceMap;
   OperationExtensionComplianceMap extensionComplianceMap;
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
index 6ae19d81e0820..14b00b04ccc18 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
@@ -137,6 +137,13 @@ def TosaAttachTarget : Pass<"tosa-attach-target", "ModuleOp"> {
   ];
 
   let options = [
+    Option<"specificationVersion", "specification_version", "mlir::tosa::SpecificationVersion",
+              /*default=*/"mlir::tosa::SpecificationVersion::V_1_0",
+              "The specification version that TOSA operators should conform to.",
+              [{::llvm::cl::values(
+                clEnumValN(mlir::tosa::SpecificationVersion::V_1_0, "1.0", "TOSA Specification version 1.0"),
+                clEnumValN(mlir::tosa::SpecificationVersion::V_1_1_DRAFT, "1.1.draft", "TOSA Specification version 1.1.draft")
+              )}]>,
     Option<"level", "level", "mlir::tosa::Level",
               /*default=*/"mlir::tosa::Level::eightK",
               "The TOSA level that operators should conform to. A TOSA level defines "
diff --git a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
index 5aad67173cc61..1cba1bb540c02 100644
--- a/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TargetEnv.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Tosa/IR/TargetEnv.h"
+#include "llvm/Support/FormatVariadic.h"
 
 namespace mlir {
 namespace tosa {
@@ -27,7 +28,7 @@ TargetEnvAttr lookupTargetEnv(Operation *op) {
 }
 
 TargetEnvAttr getDefaultTargetEnv(MLIRContext *context) {
-  return TargetEnvAttr::get(context, Level::eightK,
+  return TargetEnvAttr::get(context, SpecificationVersion::V_1_0, Level::eightK,
                             {Profile::pro_int, Profile::pro_fp}, {});
 }
 
@@ -38,5 +39,9 @@ TargetEnvAttr lookupTargetEnvOrDefault(Operation *op) {
   return getDefaultTargetEnv(op->getContext());
 }
 
+llvm::SmallString<4> stringifyVersion(TosaSpecificationVersion version) {
+  return llvm::formatv("{0}.{1}", version.getMajor(), version.getMinor());
+}
+
 } // namespace tosa
 } // namespace mlir
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
index bcb880a808b36..a0661e4ee0bd2 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaAttachTarget.cpp
@@ -61,8 +61,8 @@ class TosaAttachTarget
 
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
-    const auto targetEnvAttr =
-        TargetEnvAttr::get(ctx, level, selectedProfiles, selectedExtensions);
+    const auto targetEnvAttr = TargetEnvAttr::get(
+        ctx, specificationVersion, level, selectedProfiles, selectedExtensions);
     mod->setAttr(TargetEnvAttr::name, targetEnvAttr);
   }
 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
index 20f9333e7c951..f072e3eff1975 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp
@@ -335,16 +335,15 @@ LogicalResult ProfileInfoDepot::populatationDispatch(Operation *op) {
 //===----------------------------------------------------------------------===//
 
 template <typename T>
-FailureOr<SmallVector<T>>
-TosaProfileCompliance::getOperatorDefinition(Operation *op,
-                                             CheckCondition &condition) {
+FailureOr<OpComplianceInfo<T>>
+TosaProfileCompliance::getOperatorDefinition(Operation *op) {
   const std::string opName = op->getName().getStringRef().str();
   const auto complianceMap = getProfileComplianceMap<T>();
   const auto it = complianceMap.find(opName);
   if (it == complianceMap.end())
     return {};
 
-  return findMatchedProfile<T>(op, it->second, condition);
+  return findMatchedEntry<T>(op, it->second);
 }
 
 template <typename T>
@@ -356,22 +355,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
   if (specRequiredModeSet.size() == 0)
     return success();
 
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeOpRequiredMode = getOperatorDefinition<T>(op, condition);
-  if (failed(maybeOpRequiredMode)) {
+  const auto maybeOpDefinition = getOperatorDefinition<T>(op);
+  if (failed(maybeOpDefinition)) {
     // Operators such as control-flow and shape ops do not have an operand type
     // restriction. When the profile compliance information of operation is not
     // found, confirm if the target have enabled the profile required from the
     // specification.
-    int mode_count = 0;
+    int modeCount = 0;
     for (const auto &cands : specRequiredModeSet) {
       if (targetEnv.allowsAnyOf(cands))
         return success();
-      mode_count += cands.size();
+      modeCount += cands.size();
     }
 
     op->emitOpError() << "illegal: requires"
-                      << (mode_count > 1 ? " any of " : " ") << "["
+                      << (modeCount > 1 ? " any of " : " ") << "["
                       << llvm::join(stringifyProfile<T>(specRequiredModeSet),
                                     ", ")
                       << "] but not enabled in target\n";
@@ -381,7 +379,10 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
 
   // Find the required profiles or extensions according to the operand type
   // combination.
-  const auto opRequiredMode = maybeOpRequiredMode.value();
+  const auto opDefinition = maybeOpDefinition.value();
+  const SmallVector<T> opRequiredMode = opDefinition.mode;
+  const CheckCondition condition = opDefinition.condition;
+
   if (opRequiredMode.size() == 0) {
     // No matched restriction found.
     return success();
@@ -437,6 +438,21 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension(
     }
   }
 
+  // Ensure the matched op compliance version does not exceed the target
+  // specification version.
+  const VersionedTypeInfo versionedTypeInfo =
+      opDefinition.operandTypeInfoSet[0];
+  const TosaSpecificationVersion complianceVersion{versionedTypeInfo.second};
+  const TosaSpecificationVersion targetVersion{targetEnv.getSpecVersion()};
+  if (!targetVersion.isBackwardsCompatibleWith(complianceVersion)) {
+    op->emitOpError() << "illegal: the target specification version ("
+                      << stringifyVersion(targetVersion)
+                      << ") is not backwards compatible with the op compliance "
+                         "specification version ("
+                      << stringifyVersion(complianceVersion) << ")\n";
+    return failure();
+  }
+
   return success();
 }
 
@@ -461,14 +477,14 @@ TosaProfileCompliance::checkExtension(Operation *op,
 }
 
 LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
-  CheckCondition condition = CheckCondition::invalid;
-  const auto maybeProfDef = getOperatorDefinition<Profile>(op, condition);
-  const auto maybeExtDef = getOperatorDefinition<Extension>(op, condition);
+  const auto maybeProfDef = getOperatorDefinition<Profile>(op);
+  const auto maybeExtDef = getOperatorDefinition<Extension>(op);
   if (failed(maybeProfDef) && failed(maybeExtDef))
     return success();
 
-  const bool hasEntry = (succeeded(maybeProfDef) && !maybeProfDef->empty()) ||
-                        (succeeded(maybeExtDef) && !maybeExtDef->empty());
+  const bool hasEntry =
+      (succeeded(maybeProfDef) && !maybeProfDef->mode.empty()) ||
+      (succeeded(maybeExtDef) && !maybeExtDef->mode.empty());
   if (!hasEntry) {
     std::string message;
     llvm::raw_string_ostream os(message);
@@ -488,7 +504,9 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
     SmallVector<TypeInfo> bestTypeInfo;
     const auto searchBestMatch = [&](auto map) {
       for (const auto &complianceInfos : map[opName]) {
-        for (const auto &typeInfos : complianceInfos.operandTypeInfoSet) {
+        for (const auto &versionedTypeInfos :
+             complianceInfos.operandTypeInfoSet) {
+          const SmallVector<TypeInfo> typeInfos = versionedTypeInfos.first;
           const int matches = llvm::count_if(
               llvm::zip_equal(current, typeInfos), [&](const auto zipType) {
                 return isSameTypeInfo(std::get<0>(zipType),
@@ -520,9 +538,8 @@ LogicalResult TosaProfileCompliance::checkInvalid(Operation *op) {
 // Find the profiles or extensions requirement according to the signature of
 // type of the operand list.
 template <typename T>
-SmallVector<T> TosaProfileCompliance::findMatchedProfile(
-    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo,
-    CheckCondition &condition) {
+OpComplianceInfo<T> TosaProfileCompliance::findMatchedEntry(
+    Operation *op, SmallVector<OpComplianceInfo<T>> compInfo) {
   assert(compInfo.size() != 0 &&
          "profile-based compliance information is empty");
 
@@ -533,27 +550,30 @@ SmallVector<T> TosaProfileCompliance::findMatchedProfile(
     return {};
 
   for (size_t i = 0; i < compInfo.size(); i++) {
-    SmallVector<SmallVector<TypeInfo>> sets = compInfo[i].operandTypeInfoSet;
-    for (SmallVector<TypeInfo> expected : sets) {
+    SmallVector<VersionedTypeInfo> sets = compInfo[i].operandTypeInfoSet;
+    for (const auto &set : sets) {
+      SmallVector<TypeInfo> expected = set.first;
       assert(present.size() == expected.size() &&
              "the entries for profile-based compliance do not match between "
              "the generated metadata and the type definition retrieved from "
              " the operation");
 
-      bool is_found = true;
+      bool isFound = true;
       // Compare the type signature between the given operation and the
       // compliance metadata.
       for (size_t j = 0; j < expected.size(); j++) {
         if (!isSameTypeInfo(present[j], expected[j])) {
           // Verify the next mode set from the list.
-          is_found = false;
+          isFound = false;
           break;
         }
       }
 
-      if (is_found == true) {
-        condition = compInfo[i].condition;
-        return compInfo[i].mode;
+      if (isFound == true) {
+        SmallVector<VersionedTypeInfo> typeInfoSet{set};
+        OpComplianceInfo<T> info{compInfo[i].mode, typeInfoSet,
+                                 compInfo[i].condition};
+        return info;
       }
     }
   }
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 9603813e059d3..857e31be6f259 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -2604,6 +2604,7 @@ static constexpr std::array kExplicitLLVMFuncOpAttributes{
     StringLiteral("denormal-fp-math-f32"),
     StringLiteral("fp-contract"),
     StringLiteral("frame-pointer"),
+    StringLiteral("inlinehint"),
     StringLiteral("instrument-function-entry"),
     StringLiteral("instrument-function-exit"),
     StringLiteral("memory"),
@@ -2643,6 +2644,8 @@ void ModuleImport::processFunctionAttributes(llvm::Function *func,
     funcOp.setNoInline(true);
   if (func->hasFnAttribute(llvm::Attribute::AlwaysInline))
     funcOp.setAlwaysInline(true);
+  if (func->hasFnAttribute(llvm::Attribute::InlineHint))
+    funcOp.setInlineHint(true);
   if (func->hasFnAttribute(llvm::Attribute::OptimizeNone))
     funcOp.setOptimizeNone(true);
   if (func->hasFnAttribute(llvm::Attribute::Convergent))
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 845a14f34c016..147613f96b884 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1652,6 +1652,8 @@ static void convertFunctionAttributes(LLVMFuncOp func,
     llvmFunc->addFnAttr(llvm::Attribute::NoInline);
   if (func.getAlwaysInlineAttr())
     llvmFunc->addFnAttr(llvm::Attribute::AlwaysInline);
+  if (func.getInlineHintAttr())
+    llvmFunc->addFnAttr(llvm::Attribute::InlineHint);
   if (func.getOptimizeNoneAttr())
     llvmFunc->addFnAttr(llvm::Attribute::OptimizeNone);
   if (func.getConvergentAttr())
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
index 35f520a9f22a8..93a03369be239 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir
@@ -1,5 +1,9 @@
 // RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.dot
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_dot
 func.func @contraction_dot(%A: memref<1584xf32>, %B: memref<1584xf32>, %C: memref<f32>) {
 
@@ -20,6 +24,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matvec
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_matvec
 func.func @contraction_matvec(%A: memref<1584x1584xf32>, %B: memref<1584xf32>, %C: memref<1584xf32>) {
 
@@ -41,6 +49,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matmul
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_matmul
 func.func @contraction_matmul(%A: memref<1584x1584xf32>, %B: memref<1584x1584xf32>, %C: memref<1584x1584xf32>) {
 // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584xf32>
@@ -138,6 +150,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.batch_matmul
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: contraction_batch_matmul
 func.func @contraction_batch_matmul(%A: memref<1584x1584x1584xf32>, %B: memref<1584x1584x1584xf32>, %C: memref<1584x1584x1584xf32>) {
 // CHECK: arith.mulf %{{.*}}, %{{.*}} : vector<1584x1584x1584x1584xf32>
@@ -159,6 +175,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.contract
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: @matmul_as_contract
 // CHECK-SAME: %[[A:.*]]: tensor<24x12xf32>
 // CHECK-SAME: %[[B:.*]]: tensor<12x25xf32>
@@ -220,6 +240,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.fill
+///----------------------------------------------------------------------------------------
+
 // CHECK-LABEL: func @test_vectorize_fill
 func.func @test_vectorize_fill(%A : memref<8x16xf32>, %arg0 : f32) {
   //       CHECK: %[[V:.*]] = vector.broadcast {{.*}} : f32 to vector<8x16xf32>
@@ -259,70 +283,14 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: func @test_vectorize_copy
-func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
-  //       CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
-  //       CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-  memref.copy %A, %B :  memref<8x16xf32> to memref<8x16xf32>
-  return
-}
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.pack
+///----------------------------------------------------------------------------------------
 
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
+// Note, see a similar test in:
+//  * linalg-ops.mlir.
 
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_0d
-func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
-  //  CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
-  //       CHECK:   %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
-  //       CHECK:   %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
-  //       CHECK:   %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
-  //       CHECK:   vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
-  memref.copy %A, %B :  memref<f32> to memref<f32>
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// CHECK-LABEL: func @test_vectorize_copy_complex
-// CHECK-NOT: vector<
-func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
-  memref.copy %A, %B :  memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// Input identical as the test in vectorization.mlir. Output is different -
-// vector sizes are inferred (rather than user-specified) and hence _no_
-// masking was used.
-
-func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+func.func @pack_no_padding(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
   %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
@@ -336,7 +304,7 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// CHECK-LABEL:   func.func @test_vectorize_pack(
+// CHECK-LABEL:   func.func @pack_no_padding(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32x8x16xf32>,
 // CHECK-SAME:      %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
 // CHECK-DAG:       %[[VAL_2:.*]] = ub.poison : f32
@@ -349,13 +317,16 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+// Note, see a similar test in:
+//  * linalg-ops.mlir.
+
+func.func @pack_with_padding(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
   %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
 }
 
-// CHECK-LABEL:   func.func @test_vectorize_padded_pack(
+// CHECK-LABEL:   func.func @pack_with_padding(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32x7x15xf32>,
 // CHECK-SAME:      %[[VAL_1:.*]]: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
@@ -377,6 +348,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.map
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_map(%arg0: memref<64xf32>,
     %arg1: memref<64xf32>, %arg2: memref<64xf32>) {
   linalg.map ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>)
@@ -403,6 +378,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.transpose
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_transpose(%arg0: memref<16x32x64xf32>,
                                %arg1: memref<32x64x16xf32>) {
   linalg.transpose ins(%arg0 : memref<16x32x64xf32>)
@@ -424,6 +403,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.reduce
+///----------------------------------------------------------------------------------------
+
 func.func @vectorize_reduce(%arg0: memref<16x32x64xf32>,
                   %arg1: memref<16x64xf32>) {
   linalg.reduce ins(%arg0 : memref<16x32x64xf32>)
@@ -449,6 +432,10 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.generic
+///----------------------------------------------------------------------------------------
+
 #matmul_trait = {
   indexing_maps = [
     affine_map<(m, n, k) -> (m, k)>,
@@ -1446,6 +1433,8 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// TODO: Two Linalg Ops in one test - either split or document "why".
+
 // CHECK-DAG: #[[$M6:.*]] = affine_map<(d0, d1) -> (d0, 0)>
 
 // CHECK-LABEL:   func @fused_broadcast_red_2d
@@ -1896,3 +1885,65 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
+// -----
+
+///----------------------------------------------------------------------------------------
+/// Tests for memref.copy
+///----------------------------------------------------------------------------------------
+
+// CHECK-LABEL: func @test_vectorize_copy
+func.func @test_vectorize_copy(%A : memref<8x16xf32>, %B : memref<8x16xf32>) {
+  //       CHECK: %[[V:.*]] = vector.transfer_read {{.*}} : memref<8x16xf32>, vector<8x16xf32>
+  //       CHECK: vector.transfer_write %[[V]], {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+  memref.copy %A, %B :  memref<8x16xf32> to memref<8x16xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_0d
+func.func @test_vectorize_copy_0d(%A : memref<f32>, %B : memref<f32>) {
+  //  CHECK-SAME: (%[[A:.*]]: memref<f32>, %[[B:.*]]: memref<f32>)
+  //       CHECK:   %[[V:.*]] = vector.transfer_read %[[A]][]{{.*}} : memref<f32>, vector<f32>
+  //       CHECK:   %[[val:.*]] = vector.extract %[[V]][] : f32 from vector<f32>
+  //       CHECK:   %[[VV:.*]] = vector.broadcast %[[val]] : f32 to vector<f32>
+  //       CHECK:   vector.transfer_write %[[VV]], %[[B]][] : vector<f32>, memref<f32>
+  memref.copy %A, %B :  memref<f32> to memref<f32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_copy_complex
+// CHECK-NOT: vector<
+func.func @test_vectorize_copy_complex(%A : memref<8x16xcomplex<f32>>, %B : memref<8x16xcomplex<f32>>) {
+  memref.copy %A, %B :  memref<8x16xcomplex<f32>> to memref<8x16xcomplex<f32>>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["memref.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 11bea8d92432c..1304a90349f71 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -1307,14 +1307,17 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf
 /// Tests for linalg.pack
 ///----------------------------------------------------------------------------------------
 
-// Input identical as the test in vectorization-with-patterns.mlir. Output is
-// different - vector sizes are inferred (rather than user-specified) and hence
-// masking was used.
+// This packing requires no padding, so no out-of-bounds read/write vector Ops.
 
-// CHECK-LABEL: func @test_vectorize_pack
+// Note, see a similar test in:
+//  * linalg-ops-with-patterns.mlir
+// The output is identical (the input vector sizes == the inferred vector
+// sizes, i.e. the tensor sizes).
+
+// CHECK-LABEL: func @pack_no_padding
 // CHECK-SAME:      %[[SRC:.*]]: tensor<32x8x16xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<4x1x32x16x2xf32>
-func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
+func.func @pack_no_padding(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> {
   %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32>
   return %pack : tensor<4x1x32x16x2xf32>
 }
@@ -1325,9 +1328,9 @@ func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x1
 //      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
 //      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32>
 //  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<4x1x32x16x2xf32>, tensor<4x1x32x16x2xf32>
-//      CHECK: return %[[write]] : tensor<4x1x32x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<4x1x32x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%src: !transform.any_op {transform.readonly}) {
@@ -1339,14 +1342,18 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// Input identical as the test in vectorization-with-patterns.mlir. Output is
-// different - vector sizes are inferred (rather than user-specified) and hence
-// masking was used.
+// This packing does require padding, so there are out-of-bounds read/write
+// vector Ops.
+
+// Note, see a similar test in:
+//  * linalg-ops-with-patterns.mlir.
+// The output is different (the input vector sizes != inferred vector sizes,
+// i.e. the tensor sizes).
 
-// CHECK-LABEL: func @test_vectorize_padded_pack
+// CHECK-LABEL: func @pack_with_padding
 // CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
-func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+func.func @pack_with_padding(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
   %pad = arith.constant 0.000000e+00 : f32
   %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
   return %pack : tensor<32x4x1x16x2xf32>
@@ -1364,9 +1371,9 @@ func.func @test_vectorize_padded_pack(%src: tensor<32x7x15xf32>, %dest: tensor<3
 //      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
 //      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
 //  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[write:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
 // CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
@@ -1378,10 +1385,46 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-// CHECK-LABEL: func @test_vectorize_dynamic_pack
+// This packing does require padding, so the vector.transfer_read is
+// out-of-bounds in the padded dims (the vector.transfer_write is in-bounds).
+
+// Note, see a similar test in:
+//  * vectorization-with-patterns.mlir.
+// The output is identical (in both cases the vector sizes are inferred).
+
+// CHECK-LABEL: func @pack_with_padding_no_vector_sizes
+// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
+// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
+func.func @pack_with_padding_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
+  %pad = arith.constant 0.000000e+00 : f32
+  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
+  return %pack : tensor<32x4x1x16x2xf32>
+}
+//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
+// CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
+//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
+//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
+//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
+// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
+//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @pack_with_dynamic_dims
 // CHECK-SAME:      %[[SRC:.*]]: tensor<?x?xf32>,
 // CHECK-SAME:      %[[DEST:.*]]: tensor<?x?x16x2xf32>
-func.func @test_vectorize_dynamic_pack(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
+func.func @pack_with_dynamic_dims(%src: tensor<?x?xf32>, %dest: tensor<?x?x16x2xf32>) -> tensor<?x?x16x2xf32> {
   %pack = linalg.pack %src inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %dest : tensor<?x?xf32> -> tensor<?x?x16x2xf32>
   return %pack : tensor<?x?x16x2xf32>
 }
@@ -1418,64 +1461,6 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// -----
-
-// CHECK-LABEL: func @test_vectorize_pack_no_vector_sizes
-// CHECK-SAME:      %[[SRC:.*]]: tensor<64x4xf32>,
-// CHECK-SAME:      %[[DEST:.*]]: tensor<2x4x16x2xf32>
-func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
-  %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
-  return %pack : tensor<2x4x16x2xf32>
-}
-//  CHECK-DAG: %[[CST:.*]] = ub.poison : f32
-//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//      CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]]
-// CHECK-SAME:    {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
-//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<64x4xf32> to vector<4x16x2x2xf32>
-//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
-//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
-// CHECK-SAME:   {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
-//      CHECK: return %[[WRITE]] : tensor<2x4x16x2xf32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes
-// CHECK-SAME:      %[[SRC:.*]]: tensor<32x7x15xf32>,
-// CHECK-SAME:      %[[DEST:.*]]: tensor<32x4x1x16x2xf32>
-func.func @test_vectorize_padded_pack_no_vector_sizes(%src: tensor<32x7x15xf32>, %dest: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
-  %pad = arith.constant 0.000000e+00 : f32
-  %pack = linalg.pack %src padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
-  return %pack : tensor<32x4x1x16x2xf32>
-}
-//  CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//      CHECK: %[[READ:.*]] =  vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]]
-// CHECK-SAME:   {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
-//      CHECK: %[[SC:.*]] = vector.shape_cast %[[READ]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
-//      CHECK: %[[TR:.*]] = vector.transpose %[[SC]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
-//  CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-//      CHECK: %[[WRITE:.*]] = vector.transfer_write %[[TR]], %[[DEST]][%[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]], %[[C0_1]]]
-// CHECK-SAME:   {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
-//      CHECK: return %[[WRITE]] : tensor<32x4x1x16x2xf32>
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 : !transform.any_op
-    transform.yield
-  }
-}
-
-
 ///----------------------------------------------------------------------------------------
 /// Tests for other Ops
 ///----------------------------------------------------------------------------------------
diff --git a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
index d6c886c44b013..a0c59c0c4bb3b 100644
--- a/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-attach-target.mlir
@@ -1,12 +1,14 @@
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target="profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround,dynamic level=none" | FileCheck %s --check-prefix=CHECK-ALL
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target="level=8k" | FileCheck %s --check-prefix=CHECK-LVL-8K
 // RUN: mlir-opt %s -split-input-file -tosa-attach-target | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: mlir-opt %s -split-input-file -tosa-attach-target="specification_version=1.1.draft" | FileCheck %s --check-prefix=CHECK-VERSION-1P1
 
 // -----
 
-// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>}
-// CHECK-LVL-8K: module attributes {tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>}
-// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<level = "8k", profiles = [], extensions = []>}
+// CHECK-ALL: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = none, profiles = [pro_int, pro_fp], extensions = [int16, int4, bf16, fp8e4m3, fp8e5m2, fft, variable, controlflow, doubleround, inexactround, dynamic]>}
+// CHECK-LVL-8K: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>}
+// CHECK-DEFAULT: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.0", level = "8k", profiles = [], extensions = []>}
+// CHECK-VERSION-1P1: module attributes {tosa.target_env = #tosa.target_env<specification_version = "1.1.draft", level = "8k", profiles = [], extensions = []>}
 // CHECK-LABEL: test_simple
 func.func @test_simple(%arg0 : tensor<1x1x1x1xf32>, %arg1 : tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {
   %1 = tosa.add %arg0, %arg1 : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir
new file mode 100644
index 0000000000000..51089df238b84
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p0-invalid.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.0 profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment"
+
+// -----
+
+func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2>
+  // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}}
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>)  -> tensor<1x14x28xf16>
+  return %0 : tensor<1x14x28xf16>
+}
+
+// -----
+
+func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  // expected-error@+1 {{'tosa.matmul' op illegal: the target specification version (1.0) is not backwards compatible with the op compliance specification version (1.1)}}
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>)  -> tensor<1x14x28xf32>
+  return %0 : tensor<1x14x28xf32>
+}
diff --git a/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
new file mode 100644
index 0000000000000..81645092bf195
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/tosa-validation-version-1p1-valid.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-attach-target="specification_version=1.1.draft profiles=pro_int,pro_fp extensions=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround" -tosa-validate="strict-op-spec-alignment" | FileCheck %s
+
+// Both fp8 matmul forms below must pass `-tosa-validate` with
+// `strict-op-spec-alignment` without diagnostics for a 1.1.draft target.
+
+// -----
+
+// CHECK-LABEL: test_matmul_fp8_mixed_precision_operands
+func.func @test_matmul_fp8_mixed_precision_operands(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E5M2>) -> tensor<1x14x28xf16> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2>
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E5M2>, tensor<1xf8E4M3FN>, tensor<1xf8E5M2>)  -> tensor<1x14x28xf16>
+  return %0 : tensor<1x14x28xf16>
+}
+
+// -----
+
+// CHECK-LABEL: test_matmul_fp8_input_fp32_acc_type
+func.func @test_matmul_fp8_input_fp32_acc_type(%arg0: tensor<1x14x19xf8E4M3FN>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf32> {
+  %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN>
+  %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E4M3FN>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E4M3FN>, tensor<1xf8E4M3FN>)  -> tensor<1x14x28xf32>
+  return %0 : tensor<1x14x28xf32>
+}
diff --git a/mlir/test/Target/LLVMIR/Import/function-attributes.ll b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
index cc3d799bfc626..00d09baea393e 100644
--- a/mlir/test/Target/LLVMIR/Import/function-attributes.ll
+++ b/mlir/test/Target/LLVMIR/Import/function-attributes.ll
@@ -393,6 +393,12 @@ declare void @alwaysinline_attribute() alwaysinline
 
 // -----
 
+; CHECK-LABEL: @inlinehint_attribute
+; CHECK-SAME: attributes {inline_hint}
+declare void @inlinehint_attribute() inlinehint
+
+// -----
+
 ; CHECK-LABEL: @optnone_attribute
 ; CHECK-SAME: attributes {no_inline, optimize_none}
 declare void @optnone_attribute() noinline optnone
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 69814f2748e1d..cc243c86ca902 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -2555,6 +2555,17 @@ llvm.func @always_inline() attributes { always_inline } {
 
 // -----
 
+// CHECK-LABEL: @inline_hint
+// CHECK-SAME: #[[ATTRS:[0-9]+]]
+llvm.func @inline_hint() attributes { inline_hint } {
+  llvm.return
+}
+
+// CHECK: #[[ATTRS]]
+// CHECK-SAME: inlinehint
+
+// -----
+
 // CHECK-LABEL: @optimize_none
 // CHECK-SAME: #[[ATTRS:[0-9]+]]
 llvm.func @optimize_none() attributes { no_inline, optimize_none } {
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
index b3b72db080392..42d8ae43beba1 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
@@ -18,6 +18,7 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
+// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
index 440a6b533317d..09a4dc1577822 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
@@ -18,6 +18,7 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
+// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index 3e95791ce9a50..c53e69a25e50d 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -50,6 +50,7 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
+// XFAIL: amdgpu
 
 int main() {
   int host_var = 0;
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 2939af613b6dd..644df6e7b0339 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -16,6 +16,7 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
+// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h
index d2e10d1a7acc6..c3b68a74056ac 100644
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@@ -42,8 +42,6 @@ llvm::Pass *createJSONExporterPass();
 llvm::Pass *createJSONImporterPass();
 llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createPollyCanonicalizePass();
-llvm::Pass *createPolyhedralInfoPass();
-llvm::Pass *createPolyhedralInfoPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createScopDetectionWrapperPassPass();
 llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createScopInfoRegionPassPass();
@@ -98,8 +96,6 @@ struct PollyForcePassLinking {
     polly::createScopInfoWrapperPassPass();
     polly::createScopInfoPrinterLegacyFunctionPass(llvm::outs());
     polly::createPollyCanonicalizePass();
-    polly::createPolyhedralInfoPass();
-    polly::createPolyhedralInfoPrinterLegacyPass(llvm::outs());
     polly::createIslAstInfoWrapperPassPass();
     polly::createIslAstInfoPrinterLegacyPass(llvm::outs());
     polly::createCodeGenerationPass();
@@ -155,8 +151,6 @@ void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeSimplifyWrapperPassPass(llvm::PassRegistry &);
 void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializePruneUnprofitableWrapperPassPass(llvm::PassRegistry &);
-void initializePolyhedralInfoPass(llvm::PassRegistry &);
-void initializePolyhedralInfoPrinterLegacyPassPass(llvm::PassRegistry &);
 } // namespace llvm
 
 #endif
diff --git a/polly/include/polly/PolyhedralInfo.h b/polly/include/polly/PolyhedralInfo.h
deleted file mode 100644
index b7534cd6b8df7..0000000000000
--- a/polly/include/polly/PolyhedralInfo.h
+++ /dev/null
@@ -1,104 +0,0 @@
-//===- polly/PolyhedralInfo.h - PolyhedralInfo class definition -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// This file contains the declaration of the PolyhedralInfo class, which will
-/// provide an interface to expose polyhedral analysis information of Polly.
-///
-/// This is work in progress. We will add more API's as and when deemed
-/// required.
-//===----------------------------------------------------------------------===///
-
-#ifndef POLLY_POLYHEDRAL_INFO_H
-#define POLLY_POLYHEDRAL_INFO_H
-
-#include "llvm/Pass.h"
-#include "isl/aff_type.h"
-#include "isl/ctx.h"
-#include "isl/union_map_type.h"
-
-namespace llvm {
-class Loop;
-} // namespace llvm
-
-namespace polly {
-
-class Scop;
-class ScopInfo;
-class DependenceInfoWrapperPass;
-
-class PolyhedralInfo final : public llvm::FunctionPass {
-public:
-  static char ID; // Pass identification, replacement for typeid
-
-  /// Construct a new PolyhedralInfo pass.
-  PolyhedralInfo() : FunctionPass(ID) {}
-  ~PolyhedralInfo() {}
-
-  /// Check if a given loop is parallel.
-  ///
-  /// @param L The loop.
-  ///
-  /// @return  Returns true, if loop is parallel false otherwise.
-  bool isParallel(llvm::Loop *L) const;
-
-  /// Return the SCoP containing the @p L loop.
-  ///
-  /// @param L The loop.
-  ///
-  /// @return  Returns the SCoP containing the given loop.
-  ///          Returns null if the loop is not contained in any SCoP.
-  const Scop *getScopContainingLoop(llvm::Loop *L) const;
-
-  /// Computes the partial schedule for the given @p L loop.
-  ///
-  /// @param S The SCoP containing the given loop
-  /// @param L The loop.
-  ///
-  /// @return  Returns the partial schedule for the given loop
-  __isl_give isl_union_map *getScheduleForLoop(const Scop *S,
-                                               llvm::Loop *L) const;
-
-  /// Get the SCoP and dependence analysis information for @p F.
-  bool runOnFunction(llvm::Function &F) override;
-
-  /// Release the internal memory.
-  void releaseMemory() override {}
-
-  /// Print to @p OS if each dimension of a loop nest is parallel or not.
-  void print(llvm::raw_ostream &OS,
-             const llvm::Module *M = nullptr) const override;
-
-  /// Register all analyses and transformation required.
-  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override;
-
-private:
-  /// Check if a given loop is parallel or vectorizable.
-  ///
-  /// @param L             The loop.
-  /// @param MinDepDistPtr If not nullptr, the minimal dependence distance will
-  ///                      be returned at the address of that pointer
-  ///
-  /// @return  Returns true if loop is parallel or vectorizable, false
-  ///          otherwise.
-  bool checkParallel(llvm::Loop *L,
-                     __isl_give isl_pw_aff **MinDepDistPtr = nullptr) const;
-
-  ScopInfo *SI;
-  DependenceInfoWrapperPass *DI;
-};
-
-llvm::Pass *createPolyhedralInfoPrinterLegacyPass(llvm::raw_ostream &OS);
-} // end namespace polly
-
-namespace llvm {
-class PassRegistry;
-void initializePolyhedralInfoPass(llvm::PassRegistry &);
-void initializePolyhedralInfoPrinterLegacyPassPass(llvm::PassRegistry &);
-} // namespace llvm
-
-#endif
diff --git a/polly/lib/Analysis/PolyhedralInfo.cpp b/polly/lib/Analysis/PolyhedralInfo.cpp
deleted file mode 100644
index 8d8e81a9049df..0000000000000
--- a/polly/lib/Analysis/PolyhedralInfo.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-//===--------- PolyhedralInfo.cpp  - Create Scops from LLVM IR-------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// An interface to the Polyhedral analysis engine(Polly) of LLVM.
-//
-// This pass provides an interface to the polyhedral analysis performed by
-// Polly.
-//
-// This interface provides basic interface like isParallel, isVectorizable
-// that can be used in LLVM transformation passes.
-//
-// Work in progress, this file is subject to change.
-//
-//===----------------------------------------------------------------------===//
-
-#include "polly/PolyhedralInfo.h"
-#include "polly/DependenceInfo.h"
-#include "polly/LinkAllPasses.h"
-#include "polly/Options.h"
-#include "polly/ScopInfo.h"
-#include "polly/Support/GICHelper.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "isl/union_map.h"
-
-using namespace llvm;
-using namespace polly;
-
-#include "polly/Support/PollyDebug.h"
-#define DEBUG_TYPE "polyhedral-info"
-
-static cl::opt<bool> CheckParallel("polly-check-parallel",
-                                   cl::desc("Check for parallel loops"),
-                                   cl::Hidden, cl::cat(PollyCategory));
-
-static cl::opt<bool> CheckVectorizable("polly-check-vectorizable",
-                                       cl::desc("Check for vectorizable loops"),
-                                       cl::Hidden, cl::cat(PollyCategory));
-
-void PolyhedralInfo::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequiredTransitive<DependenceInfoWrapperPass>();
-  AU.addRequired<LoopInfoWrapperPass>();
-  AU.addRequiredTransitive<ScopInfoWrapperPass>();
-  AU.setPreservesAll();
-}
-
-bool PolyhedralInfo::runOnFunction(Function &F) {
-  DI = &getAnalysis<DependenceInfoWrapperPass>();
-  SI = getAnalysis<ScopInfoWrapperPass>().getSI();
-  return false;
-}
-
-void PolyhedralInfo::print(raw_ostream &OS, const Module *) const {
-  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  for (auto *TopLevelLoop : LI) {
-    for (auto *L : depth_first(TopLevelLoop)) {
-      OS.indent(2) << L->getHeader()->getName() << ":\t";
-      if (CheckParallel && isParallel(L))
-        OS << "Loop is parallel.\n";
-      else if (CheckParallel)
-        OS << "Loop is not parallel.\n";
-    }
-  }
-}
-
-bool PolyhedralInfo::checkParallel(Loop *L, isl_pw_aff **MinDepDistPtr) const {
-  bool IsParallel;
-  const Scop *S = getScopContainingLoop(L);
-  if (!S)
-    return false;
-  const Dependences &D =
-      DI->getDependences(const_cast<Scop *>(S), Dependences::AL_Access);
-  if (!D.hasValidDependences())
-    return false;
-  POLLY_DEBUG(dbgs() << "Loop :\t" << L->getHeader()->getName() << ":\n");
-
-  isl_union_map *Deps =
-      D.getDependences(Dependences::TYPE_RAW | Dependences::TYPE_WAW |
-                       Dependences::TYPE_WAR | Dependences::TYPE_RED)
-          .release();
-
-  POLLY_DEBUG(dbgs() << "Dependences :\t" << stringFromIslObj(Deps, "null")
-                     << "\n");
-
-  isl_union_map *Schedule = getScheduleForLoop(S, L);
-  POLLY_DEBUG(dbgs() << "Schedule: \t" << stringFromIslObj(Schedule, "null")
-                     << "\n");
-
-  IsParallel = D.isParallel(Schedule, Deps, MinDepDistPtr);
-  isl_union_map_free(Schedule);
-  return IsParallel;
-}
-
-bool PolyhedralInfo::isParallel(Loop *L) const { return checkParallel(L); }
-
-const Scop *PolyhedralInfo::getScopContainingLoop(Loop *L) const {
-  assert((SI) && "ScopInfoWrapperPass is required by PolyhedralInfo pass!\n");
-  for (auto &It : *SI) {
-    Region *R = It.first;
-    if (R->contains(L))
-      return It.second.get();
-  }
-  return nullptr;
-}
-
-//  Given a Loop and the containing SCoP, we compute the partial schedule
-//  by taking union of individual schedules of each ScopStmt within the loop
-//  and projecting out the inner dimensions from the range of the schedule.
-//   for (i = 0; i < n; i++)
-//      for (j = 0; j < n; j++)
-//        A[j] = 1;  //Stmt
-//
-//  The original schedule will be
-//    Stmt[i0, i1] -> [i0, i1]
-//  The schedule for the outer loop will be
-//    Stmt[i0, i1] -> [i0]
-//  The schedule for the inner loop will be
-//    Stmt[i0, i1] -> [i0, i1]
-__isl_give isl_union_map *PolyhedralInfo::getScheduleForLoop(const Scop *S,
-                                                             Loop *L) const {
-  isl_union_map *Schedule = isl_union_map_empty(S->getParamSpace().release());
-  int CurrDim = S->getRelativeLoopDepth(L);
-  POLLY_DEBUG(dbgs() << "Relative loop depth:\t" << CurrDim << "\n");
-  assert(CurrDim >= 0 && "Loop in region should have at least depth one");
-
-  for (auto &SS : *S) {
-    if (L->contains(SS.getSurroundingLoop())) {
-
-      unsigned int MaxDim = SS.getNumIterators();
-      POLLY_DEBUG(dbgs() << "Maximum depth of Stmt:\t" << MaxDim << "\n");
-      isl_map *ScheduleMap = SS.getSchedule().release();
-      assert(
-          ScheduleMap &&
-          "Schedules that contain extension nodes require special handling.");
-
-      ScheduleMap = isl_map_project_out(ScheduleMap, isl_dim_out, CurrDim + 1,
-                                        MaxDim - CurrDim - 1);
-      ScheduleMap = isl_map_set_tuple_id(ScheduleMap, isl_dim_in,
-                                         SS.getDomainId().release());
-      Schedule =
-          isl_union_map_union(Schedule, isl_union_map_from_map(ScheduleMap));
-    }
-  }
-  Schedule = isl_union_map_coalesce(Schedule);
-  return Schedule;
-}
-
-char PolyhedralInfo::ID = 0;
-
-Pass *polly::createPolyhedralInfoPass() { return new PolyhedralInfo(); }
-
-INITIALIZE_PASS_BEGIN(PolyhedralInfo, "polyhedral-info",
-                      "Polly - Interface to polyhedral analysis engine", false,
-                      false);
-INITIALIZE_PASS_DEPENDENCY(DependenceInfoWrapperPass);
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
-INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass);
-INITIALIZE_PASS_END(PolyhedralInfo, "polyhedral-info",
-                    "Polly - Interface to polyhedral analysis engine", false,
-                    false)
-
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// Print result from PolyhedralInfo.
-class PolyhedralInfoPrinterLegacyPass final : public FunctionPass {
-public:
-  static char ID;
-
-  PolyhedralInfoPrinterLegacyPass() : PolyhedralInfoPrinterLegacyPass(outs()) {}
-  explicit PolyhedralInfoPrinterLegacyPass(llvm::raw_ostream &OS)
-      : FunctionPass(ID), OS(OS) {}
-
-  bool runOnFunction(Function &F) override {
-    PolyhedralInfo &P = getAnalysis<PolyhedralInfo>();
-
-    OS << "Printing analysis '" << P.getPassName() << "' for function '"
-       << F.getName() << "':\n";
-    P.print(OS);
-
-    return false;
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    FunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<PolyhedralInfo>();
-    AU.setPreservesAll();
-  }
-
-private:
-  llvm::raw_ostream &OS;
-};
-
-char PolyhedralInfoPrinterLegacyPass::ID = 0;
-} // namespace
-
-Pass *polly::createPolyhedralInfoPrinterLegacyPass(raw_ostream &OS) {
-  return new PolyhedralInfoPrinterLegacyPass(OS);
-}
-
-INITIALIZE_PASS_BEGIN(
-    PolyhedralInfoPrinterLegacyPass, "print-polyhedral-info",
-    "Polly - Print interface to polyhedral analysis engine analysis", false,
-    false);
-INITIALIZE_PASS_DEPENDENCY(PolyhedralInfo);
-INITIALIZE_PASS_END(
-    PolyhedralInfoPrinterLegacyPass, "print-polyhedral-info",
-    "Polly - Print interface to polyhedral analysis engine analysis", false,
-    false)
diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt
index d91f4ecd37e6c..0ed673815ff34 100644
--- a/polly/lib/CMakeLists.txt
+++ b/polly/lib/CMakeLists.txt
@@ -43,7 +43,6 @@ add_llvm_pass_plugin(Polly
   NO_MODULE
   SUBPROJECT Polly
   Analysis/DependenceInfo.cpp
-  Analysis/PolyhedralInfo.cpp
   Analysis/ScopDetection.cpp
   Analysis/ScopDetectionDiagnostic.cpp
   Analysis/ScopInfo.cpp
diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 56cb8aadce3b6..0420dff944f62 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -30,7 +30,6 @@
 #include "polly/JSONExporter.h"
 #include "polly/LinkAllPasses.h"
 #include "polly/MaximalStaticExpansion.h"
-#include "polly/PolyhedralInfo.h"
 #include "polly/PruneUnprofitable.h"
 #include "polly/ScheduleOptimizer.h"
 #include "polly/ScopDetection.h"
@@ -232,8 +231,6 @@ void initializePollyPasses(llvm::PassRegistry &Registry) {
   initializeIslScheduleOptimizerWrapperPassPass(Registry);
   initializeIslScheduleOptimizerPrinterLegacyPassPass(Registry);
   initializePollyCanonicalizePass(Registry);
-  initializePolyhedralInfoPass(Registry);
-  initializePolyhedralInfoPrinterLegacyPassPass(Registry);
   initializeScopDetectionWrapperPassPass(Registry);
   initializeScopDetectionPrinterLegacyPassPass(Registry);
   initializeScopInlinerPass(Registry);
diff --git a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll
index 00f18aebbcd5a..ec1ccdce94508 100644
--- a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll
@@ -1,13 +1,10 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;       void jd(int *A) {
 ; CHECK:  #pragma omp parallel for
-; PINFO:  for.cond2: Loop is parallel.
 ;         for (int i = 0; i < 1024; i++)
 ;           A[i] = 1;
 ; CHECK:  #pragma omp parallel for
-; PINFO:  for.cond: Loop is parallel.
 ;         for (int i = 0; i < 1024; i++)
 ;           A[i] = A[i] * 2;
 ;       }
diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll
index bcb35cb4b07c1..9c00690605408 100644
--- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < 1024; i++)
@@ -50,6 +49,3 @@ ret:
 ; CHECK-NOT:   #pragma omp parallel for
 ; CHECK:       for (int c1 = 0; c1 <= 1023; c1 += 1)
 ; CHECK:         Stmt_loop_body(c0, c1);
-;
-; PINFO:      loop.i: Loop is parallel.
-; PINFO-NEXT: loop.j: Loop is parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll
index e2ff5d5756b13..356762a2ae5b9 100644
--- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll
+++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ; int A[1024][1024];
 ; void bar(int n) {
@@ -47,6 +46,3 @@ ret:
 ; CHECK:     #pragma simd
 ; CHECK:     for (int c1 = 0; c1 < n; c1 += 1)
 ; CHECK:       Stmt_loop_body(c0, c1);
-
-; PINFO:      loop.i: Loop is parallel.
-; PINFO-NEXT: loop.j: Loop is parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll
index 17ef7fe6f251d..066fc39def6ac 100644
--- a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output< %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < n; i++)
@@ -45,6 +44,3 @@ ret:
 ; CHECK:   #pragma omp parallel for
 ; CHECK:   for (int c1 = 0; c1 < n; c1 += 1)
 ; CHECK:     Stmt_loop_body(c0, c1);
-
-; PINFO:      loop.i: Loop is not parallel.
-; PINFO-NEXT: loop.j: Loop is parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll
index bc381e2c87fdb..77dd55cb7605e 100644
--- a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < n; i++)
@@ -44,6 +43,3 @@ ret:
 ; CHECK: for (int c0 = 0; c0 < n; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 < n; c1 += 1)
 ; CHECK:     Stmt_loop_body(c0, c1);
-
-; PINFO:      loop.i: Loop is parallel.
-; PINFO-NEXT: loop.j: Loop is not parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll
index ee02dafeedeb1..b61ebc9379b7f 100644
--- a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < n; i++)
@@ -31,4 +30,3 @@ ret:
 
 ; CHECK: for (int c0 = 0; c0 < n; c0 += 1)
 ; CHECK:   Stmt_loop_body(c0)
-; PINFO: loop.header: Loop is not parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll
index a5831302471ee..5c92a91681867 100644
--- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll
+++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < n; i++)
@@ -34,4 +33,3 @@ ret:
 ; CHECK: #pragma omp parallel for
 ; CHECK: for (int c0 = 0; c0 < n; c0 += 1)
 ; CHECK:   Stmt_loop_body(c0)
-; PINFO: loop.header: Loop is parallel.
diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll
index 31a906ed403c8..352d879199675 100644
--- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll
+++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for (i = 0; i < n; i++)
@@ -34,4 +33,3 @@ ret:
 ; CHECK-NOT: #pragma omp parallel for
 ; CHECK: for (int c0 = 0; c0 < n; c0 += 1)
 ; CHECK:   Stmt_loop_body(c0)
-; PINFO: loop.header: Loop is parallel.
diff --git a/polly/test/IstAstInfo/dependence_distance_constant.ll b/polly/test/IstAstInfo/dependence_distance_constant.ll
index 8b0e4d267c14d..9b7fb93f2f676 100644
--- a/polly/test/IstAstInfo/dependence_distance_constant.ll
+++ b/polly/test/IstAstInfo/dependence_distance_constant.ll
@@ -1,12 +1,9 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *A, int N) {
 ; CHECK:   #pragma minimal dependence distance: 1
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int j = 0; j < N; j++)
 ; CHECK:      #pragma minimal dependence distance: 8
-; PINFO-NEXT: for.cond1: Loop is not parallel.
 ;             for (int i = 0; i < N; i++)
 ;               A[i + 8] = A[i] + 1;
 ;        }
diff --git a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
index 4dae80902457c..bc21e9e07ad89 100644
--- a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
+++ b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
@@ -1,9 +1,7 @@
 ; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *restrict A, int *restrict B, int N) {
 ; CHECK:   #pragma minimal dependence distance: 5
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int i = 0; i < N; i++) {
 ;            A[i + 7] = A[i] + 1;
 ;            B[i + 5] = B[i] + 1;
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric.ll b/polly/test/IstAstInfo/dependence_distance_parametric.ll
index 3133b732c9dbc..fa569a8386b86 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric.ll
@@ -1,12 +1,9 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *A, int N, int c) {
 ; CHECK:   #pragma minimal dependence distance: 1
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int j = 0; j < N; j++)
 ; CHECK:      #pragma minimal dependence distance: max(-c, c)
-; PINFO-NEXT: for.cond1: Loop is not parallel.
 ;             for (int i = 0; i < N; i++)
 ;               A[i + c] = A[i] + 1;
 ;        }
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
index 5cce8c84a903c..7f280e0c542ca 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
@@ -1,12 +1,9 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *A, int N, int c, int v) {
 ; CHECK:   #pragma minimal dependence distance: 1
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int j = 0; j < N; j++)
 ; CHECK:      #pragma minimal dependence distance: max(-c - v, c + v)
-; PINFO-NEXT: for.cond1: Loop is not parallel.
 ;             for (int i = 0; i < N; i++)
 ;               A[i + c + v] = A[i] + 1;
 ;        }
diff --git a/polly/test/IstAstInfo/dependence_distance_varying.ll b/polly/test/IstAstInfo/dependence_distance_varying.ll
index 71c045b69e28a..d609c2f210f8d 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying.ll
@@ -1,9 +1,7 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;         void f(int *A, int N) {
 ; CHECK:    #pragma minimal dependence distance: -(N % 2) + 2
-; PINFO:    for.cond: Loop is not parallel.
 ;           for (int i = 0; i < N; i++)
 ;             A[i] = A[N - i] + 1;
 ;         }
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
index 463e942fc958a..8ed3220353c1b 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
@@ -1,12 +1,9 @@
 ; RUN: opt %loadPolly -polly-canonicalize -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *restrict A, int *restrict sum) {
 ; CHECK:   #pragma minimal dependence distance: 1
-; PINFO:    for.cond: Loop is not parallel.
 ;          for (int j = 0; j < 1024; j++)
 ; CHECK:      #pragma minimal dependence distance: 1
-; PINFO-NEXT: for.cond1: Loop is not parallel.
 ;             for (int i = j; i < 1024; i++)
 ;               A[i - 3] = A[j] * 2 + A[j] + 2;
 ;        }
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
index 67917b4a919f7..73768e9c308a4 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
@@ -1,10 +1,8 @@
 ; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ;        void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D,
 ;               int *restrict E, int N) {
 ; CHECK:   #pragma minimal dependence distance: N >= 35 ? 1 : N >= 17 && N <= 34 ? 2 : 5
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int i = 0; i < N; i++) {
 ;            A[i] = A[100 - 2 * i] + 1;
 ;            B[i] = B[100 - 3 * i] + 1;
diff --git a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
index b588e42df5d1e..697b6ca50d444 100644
--- a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
@@ -1,14 +1,10 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
 ;        void f(int N, int M, int P, int sum[P][M]) {
-; PINFO:   for.cond: Loop is not parallel.
 ;          for (int i = 0; i < N; i++)
-; PINFO-NEXT: for.cond1: Loop is parallel.
 ;             for (int j = 0; j < P; j++)
 ; CHECK:        #pragma simd
-; PINFO-NEXT:   for.cond4: Loop is parallel.
 ;               for (int k = 0; k < M; k++)
 ;                 sum[j][k] ^= j;
 ;        }
diff --git a/polly/test/IstAstInfo/reduction_in_one_dimension.ll b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
index 86a1b67f7292d..797115b6f8d70 100644
--- a/polly/test/IstAstInfo/reduction_in_one_dimension.ll
+++ b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
@@ -1,13 +1,10 @@
 ; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ; Verify that we won't privatize anything in the outer dimension
 ;
 ; CHECK:    #pragma known-parallel
-; PINFO:    for.cond: Loop is parallel.
 ; CHECK:    for (int c0 = 0; c0 < 2 * n; c0 += 1)
 ; CHECK:      #pragma simd reduction
-; PINFO-NEXT: for.cond1: Loop is not parallel.
 ; CHECK:      for (int c1 = 0; c1 <= 1023; c1 += 1)
 ; CHECK:        Stmt_for_body3(c0, c1);
 ;
diff --git a/polly/test/IstAstInfo/reduction_loop_reversal.ll b/polly/test/IstAstInfo/reduction_loop_reversal.ll
index c940f5c08fa1e..d30119787d8e0 100644
--- a/polly/test/IstAstInfo/reduction_loop_reversal.ll
+++ b/polly/test/IstAstInfo/reduction_loop_reversal.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ; CHECK-NOT: #pragma simd{{\s*$}}
 ; CHECK: #pragma simd reduction
@@ -7,9 +6,6 @@
 ; CHECK: #pragma simd{{\s*$}}
 ; CHECK: Stmt_S1(n - c1)
 ;
-; PINFO:       for.cond2: Loop is parallel.
-; PINFO-NEXT:  for.cond: Loop is not parallel.
-;
 ;    void rlr(int *A, long n) {
 ;      for (long i = 0; i < 2 * n; i++)
 ; S0:    A[0] += i;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
index 21a78e5487621..c39ffa591484d 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
@@ -1,5 +1,4 @@
 ; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -print-polyhedral-info -polly-check-parallel -disable-output < %s | FileCheck %s -check-prefix=PINFO
 ;
 ; CHECK:          #pragma known-parallel reduction (+ : MemRef_A)
 ; CHECK-NEXT:     for (int c0 = 0; c0 <= 2; c0 += 1) {
@@ -13,9 +12,6 @@
 ; CHECK-NEXT:           Stmt_S0(c1);
 ; CHECK-NEXT:     }
 ;
-; PINFO:      for.cond2: Loop is parallel.
-; PINFO-NEXT: for.cond: Loop is not parallel.
-;
 ;    void rms(int *A, long n) {
 ;      for (long i = 0; i < 2 * n; i++)
 ; S0:    A[0] += i;
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
new file mode 100644
index 0000000000000..05fcbf7beb99f
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-query/BUILD.bazel
@@ -0,0 +1,28 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+cc_library(
+    name = "lib",
+    srcs = glob(["*.cpp"]),
+    hdrs = glob(["*.h"]),
+    deps = [
+        "//clang:ast",
+        "//clang:ast_matchers",
+        "//clang:ast_matchers_dynamic",
+        "//clang:basic",
+        "//clang:frontend",
+        "//clang:serialization",
+        "//llvm:LineEditor",
+        "//llvm:Support",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
index 2808288c562a9..baad2cf053b8d 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
@@ -23,6 +23,11 @@ bool_flag(
     build_setting_default = True,
 )
 
+bool_flag(
+    name = "enable_custom_checks",
+    build_setting_default = True,
+)
+
 config_setting(
     name = "static_analyzer_enabled",
     flag_values = {
@@ -30,13 +35,25 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "custom_checks_enabled",
+    flag_values = {
+        ":enable_custom_checks": "true",
+    },
+)
+
 expand_template(
     name = "config",
     out = "clang-tidy-config.h",
     substitutions =
-        {
-            "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 0",
-        } | select({
+        select({
+            ":custom_checks_enabled": {
+                "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 1",
+            },
+            "//conditions:default": {
+                "#cmakedefine01 CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS": "#define CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS 0",
+            },
+        }) | select({
             ":static_analyzer_enabled": {
                 "#cmakedefine01 CLANG_TIDY_ENABLE_STATIC_ANALYZER": "#define CLANG_TIDY_ENABLE_STATIC_ANALYZER 1",
             },
@@ -208,6 +225,15 @@ clang_tidy_library(
     ],
 )
 
+clang_tidy_library(
+    name = "custom",
+    deps = [
+        ":lib",
+        "//clang:ast_matchers_dynamic",
+        "//clang-tools-extra/clang-query:lib",
+    ],
+)
+
 clang_tidy_library(
     name = "concurrency",
     deps = [":lib"],
@@ -365,6 +391,9 @@ CHECKS = [
 ] + select({
     ":static_analyzer_enabled": [":mpi"],
     "//conditions:default": [],
+}) + select({
+    ":custom_checks_enabled": [":custom"],
+    "//conditions:default": [],
 })
 
 cc_library(