From c2ba81c780c2eba700db0b6bc7a58b4c3de8ba76 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan
Date: Mon, 17 Nov 2025 11:58:27 +0000
Subject: [PATCH 001/105] [lldb][nfc] Fix comment about UINT32_MAX in UnwindAssemblyInstruction (#168339)

---
 lldb/include/lldb/Core/EmulateInstruction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/include/lldb/Core/EmulateInstruction.h b/lldb/include/lldb/Core/EmulateInstruction.h
index a9fd4543cbbcb..ff1386cce570b 100644
--- a/lldb/include/lldb/Core/EmulateInstruction.h
+++ b/lldb/include/lldb/Core/EmulateInstruction.h
@@ -385,8 +385,8 @@ class EmulateInstruction : public PluginInterface {
                                  const RegisterInfo *reg_info,
                                  const RegisterValue &reg_value);
 
-  // Type to represent the condition of an instruction. The UINT32 value is
-  // reserved for the unconditional case and all other value can be used in an
+  // Type to represent the condition of an instruction. The UINT32_MAX value is
+  // reserved for the unconditional case and all other values can be used in an
   // architecture dependent way.
   typedef uint32_t InstructionCondition;
   static const InstructionCondition UnconditionalCondition = UINT32_MAX;

From 74c91680e6c9e68c9915ac10eb95c2fe31fd5651 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan
Date: Mon, 17 Nov 2025 11:58:43 +0000
Subject: [PATCH 002/105] [lldb][nfc] Avoid duplicate calls to GetInstructionCondition in UnwindAssemblyInstEmulation (#168340)

---
 .../InstEmulation/UnwindAssemblyInstEmulation.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
index 790f230af74c9..8437a51471ca2 100644
--- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
@@ -174,8 +174,10 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly(
 
     m_inst_emulator_up->SetInstruction(inst->GetOpcode(), inst->GetAddress(),
                                        nullptr);
+    const EmulateInstruction::InstructionCondition new_condition =
+        m_inst_emulator_up->GetInstructionCondition();
 
-    if (last_condition != m_inst_emulator_up->GetInstructionCondition()) {
+    if (last_condition != new_condition) {
       // If the last instruction was conditional with a different condition
       // than the current condition then restore the state.
       if (last_condition != EmulateInstruction::UnconditionalCondition) {
@@ -190,7 +192,7 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly(
         condition_block_start_state = it;
       }
 
-      last_condition = m_inst_emulator_up->GetInstructionCondition();
+      last_condition = new_condition;
 
       m_inst_emulator_up->EvaluateInstruction(
           eEmulateInstructionOptionIgnoreConditions);

From fdbb888db53f156699ba3a94d5a0ce2005fd6e3c Mon Sep 17 00:00:00 2001
From: guillem-bartrina-sonarsource
Date: Mon, 17 Nov 2025 13:00:10 +0100
Subject: [PATCH 003/105] [analyzer] StdVariantChecker: fix crashes and incorrect retrieval of template arguments (#167341)

Although it is very unusual, the SVal of the argument can be UnknownVal;
this was not checked, so we could end up with a null pointer
dereference.

In addition, the template arguments of the variant are retrieved
incorrectly when type aliases are involved, causing crashes and FPs/FNs.
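For illustration, here is a minimal standalone sketch of the type-alias
case, loosely adapted from the regression tests added below; the alias
name, the alternative types, and the main() driver are illustrative
assumptions, not taken verbatim from the checker or its tests:

```cpp
#include <variant>

// The alias introduces a TemplateSpecializationType that must be desugared
// before the underlying std::variant's template arguments can be read.
template <typename... Ts>
using MyVariant = std::variant<Ts...>;

int main() {
  MyVariant<int, char> V = 'a'; // holds the char alternative
  // Before this fix, retrieving the template arguments through the alias
  // went wrong; with it, std::get<char> is recognized as an access to the
  // currently held alternative.
  return std::get<char>(V);
}
```

The same pattern, including an alias built from a template template
parameter, is exercised by the tests below.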
---
 .../Checkers/StdVariantChecker.cpp          | 13 ++++++---
 clang/test/Analysis/std-variant-checker.cpp | 29 ++++++++++++++++++-
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
index db8bbee8761d5..c5dad610bef53 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp
@@ -90,6 +90,9 @@ bool isStdVariant(const Type *Type) {
 static std::optional<ArrayRef<TemplateArgument>>
 getTemplateArgsFromVariant(const Type *VariantType) {
   const auto *TempSpecType = VariantType->getAs<TemplateSpecializationType>();
+  while (TempSpecType && TempSpecType->isTypeAlias())
+    TempSpecType =
+        TempSpecType->getAliasedType()->getAs<TemplateSpecializationType>();
 
   if (!TempSpecType)
     return {};
@@ -219,10 +222,12 @@ class StdVariantChecker : public Checker<eval::Call, check::RegionChanges> {
   bool handleStdGetCall(const CallEvent &Call, CheckerContext &C) const {
     ProgramStateRef State = C.getState();
 
-    const auto &ArgType = Call.getArgSVal(0)
-                              .getType(C.getASTContext())
-                              ->getPointeeType()
-                              .getTypePtr();
+    SVal ArgSVal = Call.getArgSVal(0);
+    if (ArgSVal.isUnknown())
+      return false;
+
+    const auto &ArgType =
+        ArgSVal.getType(C.getASTContext())->getPointeeType().getTypePtr();
     // We have to make sure that the argument is an std::variant.
     // There is another std::get with std::pair argument
     if (!isStdVariant(ArgType))
diff --git a/clang/test/Analysis/std-variant-checker.cpp b/clang/test/Analysis/std-variant-checker.cpp
index 7f136c06b19cc..5b256b0f60dd0 100644
--- a/clang/test/Analysis/std-variant-checker.cpp
+++ b/clang/test/Analysis/std-variant-checker.cpp
@@ -355,4 +355,31 @@ void nonInlineFunctionCallPtr() {
   char c = std::get<char>(v); // no-warning
   (void)a;
   (void)c;
-}
\ No newline at end of file
+}
+
+// ----------------------------------------------------------------------------//
+// Misc
+// ----------------------------------------------------------------------------//
+
+void unknownVal() {
+  // force the argument to be UnknownVal
+  (void)std::get<int>(*(std::variant<int, char> *)(int)3.14f); // no crash
+}
+
+template <typename... Ts>
+using MyVariant = std::variant<Ts...>;
+
+void typeAlias() {
+  MyVariant<int, char> v;
+
+  (void)std::get<int>(v); // no-warning
+}
+
+template <typename T, template <typename...> typename Container>
+using MySpecialVariant = std::variant<T, Container<T>>;
+
+void complexTypeAlias() {
+  MySpecialVariant<int, MyVariant> v;
+
+  (void)std::get<int>(v); // no crash
+}

From 7e730da128df186a63c6fde19693129322024c50 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 17 Nov 2025 12:01:40 +0000
Subject: [PATCH 004/105] [VPlan] Add printRecipe, prepare printing metadata in ::print (NFC) (#166244)

Add a new printRecipe which handles printing the recipe without common
info like debug info or metadata. Prepares to print them once, in
::print(), after/in combination with
https://github.com/llvm/llvm-project/pull/165825.

PR: https://github.com/llvm/llvm-project/pull/166244
---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 422 ++++++++++--------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 165 +++----
 .../Transforms/Vectorize/VPlanTest.cpp        |   8 +-
 3 files changed, 326 insertions(+), 269 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 754c6b50ae028..13131a2b61722 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -483,12 +483,25 @@ class LLVM_ABI_FOR_TEST VPRecipeBase
   /// Set the recipe's debug location to \p NewDL.
void setDebugLoc(DebugLoc NewDL) { DL = NewDL; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe, delegating to printRecipe(). + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override final; +#endif + protected: /// Compute the cost of this recipe either using a recipe's specialized /// implementation or using the legacy cost model and the underlying /// instructions. virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Each concrete VPRecipe prints itself, without printing common information, + /// like debug info or metadata. + virtual void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; +#endif }; // Helper macro to define common classof implementations for recipes. @@ -1133,10 +1146,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VPCostContext &Ctx) const override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPInstruction to \p O. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - /// Print the VPInstruction to dbgs() (for debugging). LLVM_DUMP_METHOD void dump() const; #endif @@ -1182,6 +1191,13 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Returns the symbolic name assigned to the VPInstruction. StringRef getName() const { return Name; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the VPInstruction to \p O. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A specialization of VPInstruction augmenting it with a dedicated result @@ -1247,10 +1263,11 @@ class VPInstructionWithType : public VPInstruction { Type *getResultType() const { return ResultTy; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -1339,13 +1356,13 @@ struct LLVM_ABI_FOR_TEST VPPhi : public VPInstruction, public VPPhiAccessors { void execute(VPTransformState &State) override; +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif -protected: const VPRecipeBase *getAsRecipe() const override { return this; } }; @@ -1385,12 +1402,6 @@ class VPIRInstruction : public VPRecipeBase { Instruction &getInstruction() const { return I; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - bool usesScalars(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); @@ -1413,6 +1424,13 @@ class VPIRInstruction : public VPRecipeBase { /// Builder. Must only be used for VPIRInstructions with at least one operand /// wrapping a PHINode. void extractLastLaneOfFirstOperand(VPBuilder &Builder); + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// An overlay for VPIRInstructions wrapping PHI nodes enabling convenient use @@ -1432,13 +1450,13 @@ struct LLVM_ABI_FOR_TEST VPIRPhi : public VPIRInstruction, void execute(VPTransformState &State) override; +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif -protected: const VPRecipeBase *getAsRecipe() const override { return this; } }; @@ -1482,10 +1500,11 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, unsigned getOpcode() const { return Opcode; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -1535,16 +1554,17 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - Instruction::CastOps getOpcode() const { return Opcode; } /// Returns the result type of the cast. Type *getResultType() const { return ResultTy; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening vector intrinsics. @@ -1626,13 +1646,14 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// Returns true if the intrinsic may have side-effects. bool mayHaveSideEffects() const { return MayHaveSideEffects; } + bool usesFirstLaneOnly(const VPValue *Op) const override; + +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif - - bool usesFirstLaneOnly(const VPValue *Op) const override; }; /// A recipe for widening Call instructions using library calls. @@ -1678,10 +1699,11 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags, operand_range args() { return drop_end(operands()); } const_operand_range args() const { return drop_end(operands()); } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -1722,10 +1744,11 @@ class VPHistogramRecipe : public VPRecipeBase { return getNumOperands() == 3 ? 
getOperand(2) : nullptr; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -1754,12 +1777,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - unsigned getOpcode() const { return Instruction::Select; } VPValue *getCond() const { @@ -1772,6 +1789,13 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, "Op must be an operand of the recipe"); return Op == getCond() && Op->isDefinedOutsideLoopRegions(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling GEP instructions. @@ -1826,12 +1850,6 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { return 0; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -1841,6 +1859,13 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { else return !isPointerLoopInvariant() && Op->isDefinedOutsideLoopRegions(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe to compute a pointer to the last element of each part of a widened @@ -1897,10 +1922,11 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, getDebugLoc()); } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -1952,10 +1978,11 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, return 0; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -2013,12 +2040,6 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe, InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override = 0; -#endif - /// Returns the start value of the phi, if one is set. VPValue *getStartValue() { return getNumOperands() == 0 ? 
nullptr : getOperand(0); @@ -2043,6 +2064,13 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe, virtual VPRecipeBase &getBackedgeRecipe() { return *getBackedgeValue()->getDefiningRecipe(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override = 0; +#endif }; /// Base class for widened induction (VPWidenIntOrFpInductionRecipe and @@ -2168,12 +2196,6 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { "expandVPWidenIntOrFpInductionRecipe"); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - VPValue *getSplatVFValue() { // If the recipe has been unrolled return the VPValue for the induction // increment. @@ -2207,6 +2229,13 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { VPValue *getLastUnrolledPartOperand() { return isUnrolled() ? getOperand(getNumOperands() - 1) : this; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe { @@ -2246,10 +2275,11 @@ class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe { /// Returns true if only scalar values will be generated. bool onlyScalarsGenerated(bool IsScalable); +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -2263,9 +2293,6 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe, /// Name to use for the generated IR instruction for the widened phi. std::string Name; -protected: - const VPRecipeBase *getAsRecipe() const override { return this; } - public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. @@ -2291,11 +2318,14 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif + + const VPRecipeBase *getAsRecipe() const override { return this; } }; /// A recipe for handling first-order recurrence phis. The start value is the @@ -2318,18 +2348,19 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return Op == getStartValue(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling reduction phis. The start value is the first operand @@ -2378,12 +2409,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns the number of incoming values, also number of incoming blocks. /// Note that at the moment, VPWidenPointerInductionRecipe only has a single /// incoming value, its start value. @@ -2404,6 +2429,13 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, "Op must be an operand of the recipe"); return isOrdered() || isInLoop(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select @@ -2461,12 +2493,6 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -2476,6 +2502,13 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe { return all_of(users(), [this](VPUser *U) { return U->usesFirstLaneOnly(this); }); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A common base class for interleaved memory operations. @@ -2602,12 +2635,6 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); @@ -2617,6 +2644,13 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase { unsigned getNumStoreOperands() const override { return getNumOperands() - (getMask() ? 2 : 1); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for interleaved memory operations with vector-predication @@ -2649,12 +2683,6 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. 
- void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// The recipe only uses the first lane of the address, and EVL operand. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -2666,6 +2694,13 @@ class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase { unsigned getNumStoreOperands() const override { return getNumOperands() - (getMask() ? 3 : 2); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2742,12 +2777,6 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Return the recurrence kind for the in-loop reduction. RecurKind getRecurrenceKind() const { return RdxKind; } /// Return true if the in-loop reduction is ordered. @@ -2762,6 +2791,13 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags { VPValue *getCondOp() const { return isConditional() ? getOperand(getNumOperands() - 1) : nullptr; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for forming partial reductions. In the loop, an accumulator and @@ -2819,10 +2855,11 @@ class VPPartialReductionRecipe : public VPReductionRecipe { /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -2852,12 +2889,6 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { /// Generate the reduction in the loop void execute(VPTransformState &State) override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// The VPValue of the explicit vector length. VPValue *getEVL() const { return getOperand(2); } @@ -2867,6 +2898,13 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe { "Op must be an operand of the recipe"); return Op == getEVL(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// VPReplicateRecipe replicates a given instruction producing multiple scalar @@ -2913,12 +2951,6 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. 
- void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - bool isSingleScalar() const { return IsSingleScalar; } bool isPredicated() const { return IsPredicated; } @@ -2949,6 +2981,13 @@ class LLVM_ABI_FOR_TEST VPReplicateRecipe : public VPRecipeWithIRFlags, } unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for generating conditional branches on the bits of a mask. @@ -2973,8 +3012,8 @@ class LLVM_ABI_FOR_TEST VPBranchOnMaskRecipe : public VPRecipeBase { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override { + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override { O << Indent << "BRANCH-ON-MASK "; printOperands(O, SlotTracker); } @@ -3115,12 +3154,6 @@ class VPExpressionRecipe : public VPSingleDefRecipe { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if this expression contains recipes that may read from or /// write to memory. bool mayReadOrWriteMemory() const; @@ -3131,6 +3164,13 @@ class VPExpressionRecipe : public VPSingleDefRecipe { /// Returns true if the result of this VPExpressionRecipe is a single-scalar. bool isSingleScalar() const; + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when @@ -3163,18 +3203,19 @@ class LLVM_ABI_FOR_TEST VPPredInstPHIRecipe : public VPSingleDefRecipe { return 0; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe uses scalars of operand \p Op. bool usesScalars(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); return true; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A common base class for widening memory operations. An optional mask can be @@ -3292,12 +3333,6 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, /// Generate a wide load or gather. void execute(VPTransformState &State) override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -3306,6 +3341,13 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, // their address. 
return Op == getAddr() && isConsecutive(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening load operations with vector-predication intrinsics, @@ -3333,12 +3375,6 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -3347,6 +3383,13 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { // only demand the first lane of their address. return Op == getEVL() || (Op == getAddr() && isConsecutive()); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening store operations, using the stored value, the address @@ -3374,12 +3417,6 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { /// Generate a wide store or scatter. void execute(VPTransformState &State) override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -3388,6 +3425,13 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe { // unless the same operand is also stored. return Op == getAddr() && isConsecutive() && Op != getStoredValue(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening store operations with vector-predication intrinsics, @@ -3417,12 +3461,6 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns true if the recipe only uses the first lane of operand \p Op. bool usesFirstLaneOnly(const VPValue *Op) const override { assert(is_contained(operands(), Op) && @@ -3436,6 +3474,13 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { // happen with opaque pointers. return Op == getAddr() && isConsecutive() && Op != getStoredValue(); } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// Recipe to expand a SCEV expression. 
@@ -3463,13 +3508,14 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe { return 0; } + const SCEV *getSCEV() const { return Expr; } + +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif - - const SCEV *getSCEV() const { return Expr; } }; /// Canonical scalar induction phi of the vector loop. Starting at the specified @@ -3496,12 +3542,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { "scalar phi recipe"); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - LLVM_ABI_FOR_TEST void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - /// Returns the scalar type of the induction. Type *getScalarType() const { return getStartValue()->getLiveInIRValue()->getType(); @@ -3527,6 +3567,13 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { // For now, match the behavior of the legacy cost model. return 0; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for generating the active lane mask for the vector loop that is @@ -3553,10 +3600,11 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { /// Generate the active lane mask phi of the vector loop. void execute(VPTransformState &State) override; +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -3596,10 +3644,11 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe { return true; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - LLVM_ABI_FOR_TEST void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + LLVM_ABI_FOR_TEST void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -3631,10 +3680,11 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe, return 0; } +protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; + void printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; #endif }; @@ -3686,12 +3736,6 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { return 0; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif - Type *getScalarType() const { return getStartValue()->getLiveInIRValue()->getType(); } @@ -3705,6 +3749,13 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { "Op must be an operand of the recipe"); return true; } + +protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+  void printRecipe(raw_ostream &O, const Twine &Indent,
+                   VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
@@ -3756,12 +3807,6 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
     return 0;
   }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
-
   VPValue *getStepValue() const { return getOperand(1); }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
@@ -3770,6 +3815,13 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
                       "Op must be an operand of the recipe");
     return true;
   }
+
+protected:
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void printRecipe(raw_ostream &O, const Twine &Indent,
+                   VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// Casting from VPRecipeBase -> VPPhiAccessors is supported for all recipe
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 84bf0d525b86e..cf95b4eac9d75 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -424,8 +424,8 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
-                                     VPSlotTracker &SlotTracker) const {
+void VPPartialReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
+                                           VPSlotTracker &SlotTracker) const {
   O << Indent << "PARTIAL-REDUCE ";
   printAsOperand(O, SlotTracker);
   O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
@@ -485,6 +485,11 @@ FastMathFlags VPIRFlags::getFastMathFlags() const {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPSingleDefRecipe::dump() const { VPDef::dump(); }
+
+void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
+                         VPSlotTracker &SlotTracker) const {
+  printRecipe(O, Indent, SlotTracker);
+}
 #endif
 
 template <unsigned PartOpIdx>
@@ -1369,11 +1374,11 @@ bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstruction::dump() const {
   VPSlotTracker SlotTracker(getParent()->getPlan());
-  print(dbgs(), "", SlotTracker);
+  printRecipe(dbgs(), "", SlotTracker);
 }
 
-void VPInstruction::print(raw_ostream &O, const Twine &Indent,
-                          VPSlotTracker &SlotTracker) const {
+void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
+                                VPSlotTracker &SlotTracker) const {
   O << Indent << "EMIT" << (isSingleScalar() ?
"-SCALAR" : "") << " "; printAsOperand(O, SlotTracker); O << " = "; @@ -1553,8 +1558,8 @@ void VPPhi::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPhi::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " "; printAsOperand(O, SlotTracker); O << " = phi "; @@ -1596,8 +1601,8 @@ void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "IR " << I; } #endif @@ -1652,9 +1657,9 @@ void VPPhiAccessors::printPhiOperands(raw_ostream &O, #endif #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPIRPhi::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - VPIRInstruction::print(O, Indent, SlotTracker); +void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + VPIRInstruction::printRecipe(O, Indent, SlotTracker); if (getNumOperands() != 0) { O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": "; @@ -1739,8 +1744,8 @@ InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CALL "; Function *CalledFn = getCalledScalarFunction(); @@ -1874,8 +1879,8 @@ bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INTRINSIC "; if (ResultTy->isVoidTy()) { O << "void "; @@ -1961,8 +1966,8 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-HISTOGRAM buckets: "; getOperand(0)->printAsOperand(O, SlotTracker); @@ -1980,8 +1985,8 @@ void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent, } } -void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenSelectRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-SELECT "; printAsOperand(O, SlotTracker); O << " = select "; @@ -2274,8 +2279,8 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, 
SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode); @@ -2354,8 +2359,8 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode); @@ -2378,8 +2383,8 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenIntOrFpInductionRecipe::printRecipe( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << " = WIDEN-INDUCTION "; @@ -2403,8 +2408,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << " = DERIVED-IV "; @@ -2511,8 +2516,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << " = SCALAR-STEPS "; @@ -2576,8 +2581,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; O << (isPointerLoopInvariant() ? 
"Inv" : "Var"); for (size_t I = 0; I < getNumOperands() - 1; ++I) @@ -2629,8 +2634,8 @@ void VPVectorEndPointerRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << " = vector-end-pointer"; @@ -2654,8 +2659,8 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); O << " = vector-pointer "; @@ -2679,8 +2684,8 @@ InstructionCost VPBlendRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; printAsOperand(O, SlotTracker); O << " ="; @@ -2963,8 +2968,8 @@ bool VPExpressionRecipe::isSingleScalar() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "EXPRESSION "; printAsOperand(O, SlotTracker); O << " = "; @@ -3052,8 +3057,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent, } } -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "REDUCE "; printAsOperand(O, SlotTracker); O << " = "; @@ -3072,8 +3077,8 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << ")"; } -void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "REDUCE "; printAsOperand(O, SlotTracker); O << " = "; @@ -3432,8 +3437,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << (IsSingleScalar ? 
"CLONE " : "REPLICATE "); if (!getUnderlyingInstr()->getType()->isVoidTy()) { @@ -3545,8 +3550,8 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; printAsOperand(O, SlotTracker); O << " = "; @@ -3637,8 +3642,8 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); O << " = load "; @@ -3716,8 +3721,8 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; printAsOperand(O, SlotTracker); O << " = vp.load "; @@ -3760,8 +3765,8 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN store "; printOperands(O, SlotTracker); } @@ -3825,8 +3830,8 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN vp.store "; printOperands(O, SlotTracker); } @@ -4089,8 +4094,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { const InterleaveGroup *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); @@ -4232,8 +4237,8 @@ void VPInterleaveEVLRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { const InterleaveGroup *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); @@ -4306,8 +4311,8 @@ InstructionCost VPInterleaveBase::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const 
{ +void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = CANONICAL-INDUCTION "; @@ -4321,8 +4326,8 @@ bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenPointerInductionRecipe::printRecipe( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { assert((getNumOperands() == 3 || getNumOperands() == 5) && "unexpected number of operands"); O << Indent << "EMIT "; @@ -4341,8 +4346,8 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, } } -void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = EXPAND SCEV " << *Expr; @@ -4368,8 +4373,8 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION "; @@ -4416,8 +4421,8 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPFirstOrderRecurrencePHIRecipe::printRecipe( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; printAsOperand(O, SlotTracker); O << " = phi "; @@ -4451,8 +4456,8 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-REDUCTION-PHI "; printAsOperand(O, SlotTracker); @@ -4471,8 +4476,8 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-PHI "; printAsOperand(O, SlotTracker); @@ -4494,8 +4499,8 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "ACTIVE-LANE-MASK-PHI "; printAsOperand(O, SlotTracker); @@ -4505,8 +4510,8 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, #endif #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void 
VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
-                                 VPSlotTracker &SlotTracker) const {
+void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
+                                        VPSlotTracker &SlotTracker) const {
   O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
 
   printAsOperand(O, SlotTracker);
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 53a9f45fd5c3a..ee7fa175ca918 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -749,7 +749,7 @@ TEST_F(VPBasicBlockTest, print) {
   std::string I3Dump;
   raw_string_ostream OS(I3Dump);
   VPSlotTracker SlotTracker;
-  I3->print(OS, "", SlotTracker);
+  cast<VPRecipeBase>(I3)->print(OS, "", SlotTracker);
   EXPECT_EQ("EMIT store <badref>, <badref>", I3Dump);
 }
 
@@ -818,7 +818,7 @@ Successor(s): ir-bb
   std::string I3Dump;
   raw_string_ostream OS(I3Dump);
   VPSlotTracker SlotTracker(&Plan);
-  I3->print(OS, "", SlotTracker);
+  cast<VPRecipeBase>(I3)->print(OS, "", SlotTracker);
   EXPECT_EQ("EMIT store vp<%1>, vp<%2>", I3Dump);
 }
 
@@ -1726,8 +1726,8 @@ struct VPDoubleValueDef : public VPRecipeBase {
   void execute(struct VPTransformState &State) override {}
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override {}
+  void printRecipe(raw_ostream &O, const Twine &Indent,
+                   VPSlotTracker &SlotTracker) const override {}
 #endif
 };

From 82ba3f5d316c102aad1b0721d64c028a8724a3a4 Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev
Date: Mon, 17 Nov 2025 13:10:54 +0100
Subject: [PATCH 005/105] [clang][DebugInfo] Clear retained nodes list of vararg thunk's DISubprogram (#167758)

This fixes the issue reported in
https://github.com/llvm/llvm-project/pull/166855#issuecomment-3518604073
that had been revealed after
https://github.com/llvm/llvm-project/pull/166855 was merged.

`CodeGenFunction::GenerateVarArgsThunk` creates thunks for vararg
functions by cloning and modifying them. It is different from
`CodeGenFunction::generateThunk`, which is used for the Itanium ABI.

According to https://reviews.llvm.org/D39396,
`CodeGenFunction::GenerateVarArgsThunk` may be called before metadata
nodes are resolved. So, it tries to avoid remapping the DISubprogram and
all metadata nodes it references inside `CloneFunction()` by manually
cloning the DISubprogram.

If the optimization level is not OptNone, the DILocalVariables of a
function are saved in the DISubprogram's retainedNodes field. When
`CodeGenFunction::GenerateVarArgsThunk` clones such a DISubprogram
without remapping, it produces a subprogram with incorrectly-scoped
retained nodes. This triggers the Verifier checks added in
https://github.com/llvm/llvm-project/pull/166855.

To solve that, the retained nodes list of the cloned DISubprogram is
cleared.
---
 clang/lib/CodeGen/CGVTables.cpp         |  5 +++++
 clang/test/CodeGenCXX/tmp-md-nodes1.cpp | 16 ++++++++++++++++
 clang/test/CodeGenCXX/tmp-md-nodes2.cpp | 16 ++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index 00d9f93effb32..c95bd9a3067a9 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -125,6 +125,11 @@ static void resolveTopLevelMetadata(llvm::Function *Fn,
   if (!DIS)
     return;
   auto *NewDIS = llvm::MDNode::replaceWithDistinct(DIS->clone());
+  // As DISubprogram remapping is avoided, clear the cloned DISubprogram's
+  // retained nodes list of nodes local to the original DISubprogram.
+  // FIXME: The thunk function signature is emitted incorrectly in DWARF, as
+  // retained nodes are not remapped.
+  NewDIS->replaceRetainedNodes(llvm::MDTuple::get(Fn->getContext(), {}));
   VMap.MD()[DIS].reset(NewDIS);
 
   // Find all llvm.dbg.declare intrinsics and resolve the DILocalVariable nodes
diff --git a/clang/test/CodeGenCXX/tmp-md-nodes1.cpp b/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
index 524b2c08c1ad5..f39dca3edaed1 100644
--- a/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
+++ b/clang/test/CodeGenCXX/tmp-md-nodes1.cpp
@@ -2,6 +2,14 @@
 // RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm %s -o - | \
 // RUN:   FileCheck %s
 
+// Trigger GenerateVarArgsThunk.
+// RUN: %clang_cc1 -O0 -triple riscv64-linux-gnu -debug-info-kind=limited -emit-llvm %s -o - | \
+// RUN:   FileCheck %s
+
+// Check that retainedNodes are properly maintained during function cloning.
+// RUN: %clang_cc1 -O1 -triple riscv64-linux-gnu -debug-info-kind=limited -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DI
+
 // This test simply checks that the varargs thunk is created. The failing test
 // case asserts.
 
@@ -16,3 +24,11 @@ struct CharlieImpl : Charlie, Alpha {
 } delta;
 
 // CHECK: define {{.*}} void @_ZThn{{[48]}}_N11CharlieImpl5bravoEz(
+
+// CHECK-DI: distinct !DISubprogram({{.*}}, linkageName: "_ZN11CharlieImpl5bravoEz", {{.*}}, retainedNodes: [[RN1:![0-9]+]]
+// A non-empty retainedNodes list of the original DISubprogram.
+// CHECK-DI: [[RN1]] = !{!{{.*}}}
+
+// CHECK-DI: distinct !DISubprogram({{.*}}, linkageName: "_ZN11CharlieImpl5bravoEz", {{.*}}, retainedNodes: [[EMPTY:![0-9]+]]
+// An empty retainedNodes list of the cloned DISubprogram.
+// CHECK-DI: [[EMPTY]] = !{}
diff --git a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
index 8500cf3c42393..0c323ae4f58aa 100644
--- a/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
+++ b/clang/test/CodeGenCXX/tmp-md-nodes2.cpp
@@ -2,6 +2,14 @@
 // RUN: %clang_cc1 -O0 -triple %itanium_abi_triple -debug-info-kind=limited -emit-llvm %s -o - | \
 // RUN:   FileCheck %s
 
+// Trigger GenerateVarArgsThunk.
+// RUN: %clang_cc1 -O0 -triple riscv64-linux-gnu -debug-info-kind=limited -emit-llvm %s -o - | \
+// RUN:   FileCheck %s
+
+// Check that retainedNodes are properly maintained during function cloning.
+// RUN: %clang_cc1 -O1 -triple riscv64-linux-gnu -debug-info-kind=limited -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DI
+
 // This test simply checks that the varargs thunk is created. The failing test
 // case asserts.
 
@@ -31,3 +39,11 @@ BOOL CBdVfsImpl::ReqCacheHint( CMsgAgent* p_ma, CACHE_HINT hint, ... ) {
 }
 
 // CHECK: define {{.*}} @_ZThn{{[48]}}_N10CBdVfsImpl12ReqCacheHintEP9CMsgAgentN3CFs10CACHE_HINTEz(
+
+// An empty retainedNodes list of the cloned DISubprogram.
+// CHECK-DI: [[EMPTY:![0-9]+]] = !{}
+// CHECK-DI: distinct !DISubprogram({{.*}}, linkageName: "_ZN10CBdVfsImpl12ReqCacheHintEP9CMsgAgentN3CFs10CACHE_HINTEz", {{.*}}, retainedNodes: [[RN1:![0-9]+]]
+// A non-empty retainedNodes list of the original DISubprogram.
+// CHECK-DI: [[RN1]] = !{!{{.*}}}
+
+// CHECK-DI: distinct !DISubprogram({{.*}}, linkageName: "_ZN10CBdVfsImpl12ReqCacheHintEP9CMsgAgentN3CFs10CACHE_HINTEz", {{.*}}, retainedNodes: [[EMPTY]]

From 3ee54a6b992c6053726764905030946f8bc10cd0 Mon Sep 17 00:00:00 2001
From: Katya Romanova <56653669+romanova-ekaterina@users.noreply.github.com>
Date: Mon, 17 Nov 2025 04:24:26 -0800
Subject: [PATCH 006/105] [DTLTO] [LLVM] Initial DTLTO cache implementation
 (#156433)

This patch implements the DTLTO cache. The DTLTO cache is implemented
the same way as the ThinLTO cache; in fact, the same class, Cache, is
used for both of them.

Because the codegen parameters differ between DTLTO and ThinLTO (DTLTO
codegen is done by invoking clang, and its codegen parameters are not
fully synchronized with the codegen parameters used by the LTO
backend), the object files generated by DTLTO and ThinLTO might be
different and shouldn't be mixed. If ThinLTO and DTLTO share the same
cache directory, the cache files won't interfere with each other.

I added a couple of test files in the cross-project-tests/dtlto
directory, but if more tests are required for the initial
implementation, I could add them.
---
 cross-project-tests/dtlto/dtlto-cache.test    |  89 +++++++++
 .../dtlto/dtlto-thinlto-cache.test            |  70 ++++++++
 llvm/include/llvm/LTO/Config.h                |   5 +
 llvm/lib/LTO/LTO.cpp                          | 170 ++++++++++++++----
 llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll    |  74 ++++++++
 5 files changed, 371 insertions(+), 37 deletions(-)
 create mode 100644 cross-project-tests/dtlto/dtlto-cache.test
 create mode 100644 cross-project-tests/dtlto/dtlto-thinlto-cache.test
 create mode 100644 llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll

diff --git a/cross-project-tests/dtlto/dtlto-cache.test b/cross-project-tests/dtlto/dtlto-cache.test
new file mode 100644
index 0000000000000..b98d4dbb433bb
--- /dev/null
+++ b/cross-project-tests/dtlto/dtlto-cache.test
@@ -0,0 +1,89 @@
+REQUIRES: x86-registered-target, ld.lld
+
+# Show that the ThinLTO cache works with DTLTO.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+# Compile source files into bitcode files.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c foo.c main.c
+
+# Execute the linker and check that the cache is populated.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o populate1.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that two backend compilation jobs occurred.
+RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3
+RUN: ls cache.dir/llvmcache.timestamp
+RUN: ls cache.dir | count 3
+
+# Execute the linker again and check that a fully populated cache is used correctly,
+# i.e., no additional cache entries are created for cache hits.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o populate2.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that no backend compilation jobs occurred.
+RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 1
+RUN: ls cache.dir | count 3
+
+RUN: %clang -O0 --target=x86_64-linux-gnu -flto=thin -c foo.c -o foo.O0.o
+RUN: %clang -O0 --target=x86_64-linux-gnu -flto=thin -c main.c -o main.O0.o
+
+# Execute the linker again and check that the cache is populated correctly when there
+# are no cache hits but there are existing cache entries.
+# As a side effect, this also verifies that the optimization level is considered when
+# evaluating the cache entry key.
+
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.O0.o foo.O0.o -o populate3.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that two new backend compilation jobs occurred.
+RUN: grep -wo args populate3.*.dist-file.json | wc -l | grep -qx 3
+RUN: ls cache.dir | count 5
+
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c main-partial.c
+
+# Execute the linker and check that everything works correctly with the partially
+# populated cache. One more cache entry should be generated after this run.
+
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main-partial.o foo.o -o main-partial.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that one new backend compilation job occurred.
+RUN: grep -wo args main-partial.*.dist-file.json | wc -l | grep -qx 2
+RUN: ls cache.dir | count 6
+
+#--- foo.c
+volatile int foo_int;
+__attribute__((retain)) int foo(int x) { return x + foo_int; }
+
+#--- main.c
+extern int foo(int x);
+__attribute__((retain)) int main(int argc, char** argv) {
+  return foo(argc);
+}
+
+#--- main-partial.c
+extern int foo(int x);
+__attribute__((retain)) int main(int argc, char** argv) {
+  return foo(argc+1);
+}
diff --git a/cross-project-tests/dtlto/dtlto-thinlto-cache.test b/cross-project-tests/dtlto/dtlto-thinlto-cache.test
new file mode 100644
index 0000000000000..c177112e2dbbd
--- /dev/null
+++ b/cross-project-tests/dtlto/dtlto-thinlto-cache.test
@@ -0,0 +1,70 @@
+REQUIRES: x86-registered-target, ld.lld
+
+# This test verifies that a cache populated by an in-process ThinLTO codegen is
+# not reused by an out-of-process (DTLTO) codegen and vice versa.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+# Compile source files into bitcode files.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c foo.c main.c
+
+# Execute the linker and check that the in-process ThinLTO cache is populated.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o main.elf \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+RUN: ls cache.dir/llvmcache.timestamp
+RUN: ls cache.dir | count 3
+
+# Execute the linker and check that out-of-process codegen (DTLTO) adds
+# additional entries to the cache, implying that in-process and
+# out-of-process codegens do not share cache entries.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o populate1.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that two backend compilation jobs occurred.
+RUN: grep -wo args populate1.*.dist-file.json | wc -l | grep -qx 3
+RUN: ls cache.dir | count 5
+
+# Clean up the cache directory.
+RUN: rm -rf cache.dir
+
+# Execute the linker and check that the out-of-process (DTLTO) cache is populated.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o populate2.elf \
+RUN:   -Wl,--thinlto-distributor=%python \
+RUN:   -Wl,--thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--thinlto-remote-compiler=%clang \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+# Check that two backend compilation jobs occurred.
+RUN: grep -wo args populate2.*.dist-file.json | wc -l | grep -qx 3
+RUN: ls cache.dir/llvmcache.timestamp
+RUN: ls cache.dir | count 3
+
+# Execute the linker and check that in-process codegen adds additional entries
+# to the cache, implying that in-process and out-of-process codegens do
+# not share cache entries.
+RUN: %clang -O2 --target=x86_64-linux-gnu -Werror -flto=thin -fuse-ld=lld -nostdlib -e main \
+RUN:   main.o foo.o -o main.elf \
+RUN:   -Wl,--thinlto-cache-dir=cache.dir \
+RUN:   -Wl,--save-temps
+
+RUN: ls cache.dir | count 5
+
+#--- foo.c
+volatile int foo_int;
+__attribute__((retain)) int foo(int x) { return x + foo_int; }
+
+#--- main.c
+extern int foo(int x);
+__attribute__((retain)) int main(int argc, char** argv) {
+  return foo(argc);
+}
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 50e143c518213..566a87ed1a790 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -94,6 +94,11 @@ struct Config {
   /// need to create copies, so it can set this field to false.
   bool KeepSymbolNameCopies = true;
 
+  /// This flag is used as one of the parameters when computing cache entry
+  /// keys and ensures that the in-process cache and the out-of-process
+  /// (DTLTO) cache are kept distinct.
+  mutable bool Dtlto = 0;
+
   /// Allows non-imported definitions to get the potentially more constraining
   /// visibility from the prevailing definition. FromPrevailing is the default
   /// because it works for many binary formats. ELF can use the more optimized
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index fefc733fa7697..a02af59600c44 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -169,6 +169,7 @@ std::string llvm::computeLTOCacheKey(
   AddString(Conf.OverrideTriple);
   AddString(Conf.DefaultTriple);
   AddString(Conf.DwoDir);
+  AddUint8(Conf.Dtlto);
 
   // Include the hash for the current module
   auto ModHash = Index.getModuleHash(ModuleID);
@@ -2226,7 +2227,8 @@ class OutOfProcessThinBackend : public CGThinBackend {
 
   SmallVector CodegenOptions;
   DenseSet CommonInputs;
-
+  // Number of object files that have already been cached.
+  std::atomic CachedJobs{0};
   // Information specific to individual backend compilation job.
struct Job {
     unsigned Task;
@@ -2234,6 +2236,9 @@
     StringRef NativeObjectPath;
     StringRef SummaryIndexPath;
     ImportsFilesContainer ImportsFiles;
+    std::string CacheKey;
+    AddStreamFn CacheAddStream;
+    bool Cached = false;
   };
   // The set of backend compilations jobs.
   SmallVector Jobs;
@@ -2247,12 +2252,15 @@
   // The target triple to supply for backend compilations.
   llvm::Triple Triple;
 
+  // Cache for backend compilation results.
+  FileCache Cache;
+
 public:
   OutOfProcessThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap &ModuleToDefinedGVSummaries,
-      AddStreamFn AddStream, lto::IndexWriteCallback OnWrite,
+      AddStreamFn AddStream, FileCache CacheFn, lto::IndexWriteCallback OnWrite,
      bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles,
       StringRef LinkerOutputFile, StringRef Distributor,
       ArrayRef DistributorArgs, StringRef RemoteCompiler,
@@ -2264,7 +2272,8 @@
         LinkerOutputFile(LinkerOutputFile), DistributorPath(Distributor),
         DistributorArgs(DistributorArgs), RemoteCompiler(RemoteCompiler),
         RemoteCompilerPrependArgs(RemoteCompilerPrependArgs),
-        RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps) {}
+        RemoteCompilerArgs(RemoteCompilerArgs), SaveTemps(SaveTemps),
+        Cache(std::move(CacheFn)) {}
 
   void setup(unsigned ThinLTONumTasks, unsigned ThinLTOTaskOffset,
              llvm::Triple Triple) override {
@@ -2272,6 +2281,54 @@
     Jobs.resize((size_t)ThinLTONumTasks);
     this->ThinLTOTaskOffset = ThinLTOTaskOffset;
     this->Triple = Triple;
+    this->Conf.Dtlto = 1;
+  }
+
+  virtual Error runThinLTOBackendThread(
+      Job &J, const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map
+          &ResolvedODR) {
+
+    llvm::TimeTraceScope timeScope(
+        "Run ThinLTO backend thread (out-of-process)", J.ModuleID);
+
+    if (auto E = emitFiles(ImportList, J.ModuleID, J.ModuleID.str(),
+                           J.SummaryIndexPath, J.ImportsFiles))
+      return E;
+
+    if (!Cache.isValid() || !CombinedIndex.modulePaths().count(J.ModuleID) ||
+        all_of(CombinedIndex.getModuleHash(J.ModuleID),
+               [](uint32_t V) { return V == 0; }))
+      // Cache disabled or no entry for this module in the combined index or
+      // no module hash.
+      return Error::success();
+
+    const GVSummaryMapTy &DefinedGlobals =
+        ModuleToDefinedGVSummaries.find(J.ModuleID)->second;
+
+    // Compute the cache key used to look up this module's object file.
+    J.CacheKey = computeLTOCacheKey(Conf, CombinedIndex, J.ModuleID, ImportList,
+                                    ExportList, ResolvedODR, DefinedGlobals,
+                                    CfiFunctionDefs, CfiFunctionDecls);
+
+    // Query the cache with the computed key.
+    auto CacheAddStreamExp = Cache(J.Task, J.CacheKey, J.ModuleID);
+    if (Error Err = CacheAddStreamExp.takeError())
+      return Err;
+    AddStreamFn &CacheAddStream = *CacheAddStreamExp;
+    // If CacheAddStream is null, we have a cache hit and at this point the
+    // object file has already been passed back to the linker.
+    if (!CacheAddStream) {
+      J.Cached = true; // Cache hit, mark the job as cached.
+      CachedJobs.fetch_add(1);
+    } else {
+      // If CacheAddStream is not null, we have a cache miss and we need to
+      // run the backend for codegen. Save the cache 'add stream'
+      // function for later use.
+ J.CacheAddStream = std::move(CacheAddStream); + } + return Error::success(); } Error start( @@ -2288,22 +2345,27 @@ class OutOfProcessThinBackend : public CGThinBackend { itostr(Task) + "." + UID + ".native.o"); Job &J = Jobs[Task - ThinLTOTaskOffset]; - J = { - Task, - ModulePath, - Saver.save(ObjFilePath.str()), - Saver.save(ObjFilePath.str() + ".thinlto.bc"), - {} // Filled in by emitFiles below. - }; + J = {Task, + ModulePath, + Saver.save(ObjFilePath.str()), + Saver.save(ObjFilePath.str() + ".thinlto.bc"), + {}, // Filled in by emitFiles below. + "", /*CacheKey=*/ + nullptr, + false}; assert(ModuleToDefinedGVSummaries.count(ModulePath)); // The BackendThreadPool is only used here to write the sharded index files // (similar to WriteIndexesThinBackend). BackendThreadPool.async( - [=](Job &J, const FunctionImporter::ImportMapTy &ImportList) { - if (auto E = emitFiles(ImportList, J.ModuleID, J.ModuleID.str(), - J.SummaryIndexPath, J.ImportsFiles)) { + [=](Job &J, const FunctionImporter::ImportMapTy &ImportList, + const FunctionImporter::ExportSetTy &ExportList, + const std::map + &ResolvedODR) { + Error E = + runThinLTOBackendThread(J, ImportList, ExportList, ResolvedODR); + if (E) { std::unique_lock L(ErrMu); if (Err) Err = joinErrors(std::move(*Err), std::move(E)); @@ -2311,7 +2373,8 @@ class OutOfProcessThinBackend : public CGThinBackend { Err = std::move(E); } }, - std::ref(J), std::ref(ImportList)); + std::ref(J), std::ref(ImportList), std::ref(ExportList), + std::ref(ResolvedODR)); return Error::success(); } @@ -2405,6 +2468,10 @@ class OutOfProcessThinBackend : public CGThinBackend { JOS.attributeArray("jobs", [&]() { for (const auto &J : Jobs) { assert(J.Task != 0); + if (J.Cached) { + assert(!Cache.getCacheDirectoryPath().empty()); + continue; + } SmallVector Inputs; SmallVector Outputs; @@ -2477,20 +2544,28 @@ class OutOfProcessThinBackend : public CGThinBackend { removeFile(JsonFile); }); - SmallVector Args = {DistributorPath}; - llvm::append_range(Args, DistributorArgs); - Args.push_back(JsonFile); - std::string ErrMsg; - if (sys::ExecuteAndWait(Args[0], Args, - /*Env=*/std::nullopt, /*Redirects=*/{}, - /*SecondsToWait=*/0, /*MemoryLimit=*/0, &ErrMsg)) { - return make_error( - BCError + "distributor execution failed" + - (!ErrMsg.empty() ? ": " + ErrMsg + Twine(".") : Twine(".")), - inconvertibleErrorCode()); + // Checks if we have any jobs that don't have corresponding cache entries. + if (CachedJobs.load() < Jobs.size()) { + SmallVector Args = {DistributorPath}; + llvm::append_range(Args, DistributorArgs); + Args.push_back(JsonFile); + std::string ErrMsg; + if (sys::ExecuteAndWait(Args[0], Args, + /*Env=*/std::nullopt, /*Redirects=*/{}, + /*SecondsToWait=*/0, /*MemoryLimit=*/0, + &ErrMsg)) { + return make_error( + BCError + "distributor execution failed" + + (!ErrMsg.empty() ? ": " + ErrMsg + Twine(".") : Twine(".")), + inconvertibleErrorCode()); + } } for (auto &Job : Jobs) { + if (!Job.CacheKey.empty() && Job.Cached) { + assert(Cache.isValid()); + continue; + } // Load the native object from a file into a memory buffer // and store its contents in the output buffer. 
auto ObjFileMbOrErr =
@@ -2501,15 +2576,35 @@
           BCError + "cannot open native object file: " +
               Job.NativeObjectPath + ": " + EC.message(),
           inconvertibleErrorCode());
-      auto StreamOrErr = AddStream(Job.Task, Job.ModuleID);
-      if (Error Err = StreamOrErr.takeError())
-        report_fatal_error(std::move(Err));
-      auto &Stream = *StreamOrErr->get();
-      *Stream.OS << ObjFileMbOrErr->get()->getMemBufferRef().getBuffer();
-      if (Error Err = Stream.commit())
-        report_fatal_error(std::move(Err));
-    }
+      MemoryBufferRef ObjFileMbRef = ObjFileMbOrErr->get()->getMemBufferRef();
+      if (Cache.isValid()) {
+        // Cache hits are taken care of earlier. At this point, we can only
+        // have cache misses.
+        assert(Job.CacheAddStream);
+        // Obtain a file stream for storing a cache entry.
+        auto CachedFileStreamOrErr = Job.CacheAddStream(Job.Task, Job.ModuleID);
+        if (!CachedFileStreamOrErr)
+          return joinErrors(
+              CachedFileStreamOrErr.takeError(),
+              createStringError(inconvertibleErrorCode(),
+                                "Cannot get a cache file stream: %s",
+                                Job.NativeObjectPath.data()));
+        // Store the file buffer into the cache stream.
+        auto &CacheStream = *(CachedFileStreamOrErr->get());
+        *(CacheStream.OS) << ObjFileMbRef.getBuffer();
+        if (Error Err = CacheStream.commit())
+          return Err;
+      } else {
+        auto StreamOrErr = AddStream(Job.Task, Job.ModuleID);
+        if (Error Err = StreamOrErr.takeError())
+          report_fatal_error(std::move(Err));
+        auto &Stream = *StreamOrErr->get();
+        *Stream.OS << ObjFileMbRef.getBuffer();
+        if (Error Err = Stream.commit())
+          report_fatal_error(std::move(Err));
+      }
+    }
     return Error::success();
   }
 };
@@ -2525,12 +2620,13 @@ ThinBackend lto::createOutOfProcessThinBackend(
   auto Func =
       [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
           const DenseMap &ModuleToDefinedGVSummaries,
-          AddStreamFn AddStream, FileCache /*Cache*/) {
+          AddStreamFn AddStream, FileCache Cache) {
        return std::make_unique(
             Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
-            AddStream, OnWrite, ShouldEmitIndexFiles, ShouldEmitImportsFiles,
-            LinkerOutputFile, Distributor, DistributorArgs, RemoteCompiler,
-            RemoteCompilerPrependArgs, RemoteCompilerArgs, SaveTemps);
+            AddStream, Cache, OnWrite, ShouldEmitIndexFiles,
+            ShouldEmitImportsFiles, LinkerOutputFile, Distributor,
+            DistributorArgs, RemoteCompiler, RemoteCompilerPrependArgs,
+            RemoteCompilerArgs, SaveTemps);
       };
   return ThinBackend(Func, Parallelism);
 }
diff --git a/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll
new file mode 100644
index 0000000000000..df98c5e90b1ae
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/dtlto/dtlto-cache.ll
@@ -0,0 +1,74 @@
+; Test DTLTO caching with llvm-lto2.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+; Generate bitcode files with summary.
+RUN: opt -thinlto-bc t1.ll -o t1.bc
+RUN: opt -thinlto-bc t2.ll -o t2.bc
+
+; Generate fake object files for mock.py to return.
+RUN: touch t1.o t2.o
+
+; Create an empty subdirectory to avoid having to account for the input files.
+RUN: mkdir %t/out && cd %t/out
+
+; Define a substitution to share the common DTLTO arguments with caching enabled.
+DEFINE: %{command} = llvm-lto2 run ../t1.bc ../t2.bc -o t.o -cache-dir cache-dir \
+DEFINE:   -dtlto-distributor=%python \
+DEFINE:   -dtlto-distributor-arg=%llvm_src_root/utils/dtlto/mock.py,../t1.o,../t2.o \
+DEFINE:   -r=../t1.bc,t1,px \
+DEFINE:   -r=../t2.bc,t2,px
+
+; Perform out-of-process ThinLTO (DTLTO).
+; Note: mock.py does not do any compilation; instead, it simply writes
+; the contents of the object files supplied on the command line into the
+; output object files in job order.
+RUN: %{command}
+
+; Check that the expected output files have been created.
+RUN: ls | count 3
+; Check that two native object files have been created.
+RUN: ls | FileCheck %s --check-prefix=THINLTO
+; Check that the DTLTO cache directory has been created.
+RUN: ls cache-dir/* | count 2
+; Check that 2 cache entries have been created.
+RUN: ls cache-dir/llvmcache-* | count 2
+
+; llvm-lto2 ThinLTO output files.
+THINLTO-DAG: {{^}}t.o.1{{$}}
+THINLTO-DAG: {{^}}t.o.2{{$}}
+
+; Execute llvm-lto2 again and check that a fully populated cache is used correctly,
+; i.e., no additional cache entries are created for cache hits.
+
+RUN: %{command}
+
+; Check that the expected output files have been created.
+RUN: ls | count 3
+; Check that two native object files have been created.
+RUN: ls | FileCheck %s --check-prefix=THINLTO
+; Check that the DTLTO cache directory has been created.
+RUN: ls cache-dir/* | count 2
+; Check that 2 cache entries have been created.
+RUN: ls cache-dir/llvmcache-* | count 2
+
+
+
+
+;--- t1.ll
+
+target triple = "x86_64-unknown-linux-gnu"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @t1() {
+  ret void
+}
+
+;--- t2.ll
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @t2() {
+  ret void
+}

From c2ddaaa4255cd4ab82eb9be6b1ac1842ec1e4edd Mon Sep 17 00:00:00 2001
From: mitchell
Date: Mon, 17 Nov 2025 20:28:02 +0800
Subject: [PATCH 007/105] [NFC][analyzer] Add missing documentation for
 `decodeValueOfObjCType` (#167822)

This check was introduced in
https://github.com/llvm/llvm-project/commit/b284005072122fe4af879725e3c8090009f89ca0,
but the documentation appears to be missing from `checkers.rst`.
---
 clang/docs/analyzer/checkers.rst | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index fd0b304cba0df..31edf9e99dc7d 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -198,7 +198,7 @@ as error. Specifically on x86/x86-64 target if the pointer address space is
 dereference is not defined as error. See `X86/X86-64 Language Extensions
 `__
 for reference.
- 
+
 If the analyzer option ``suppress-dereferences-from-any-address-space`` is set
 to true (the default value), then this checker never reports dereference of
 pointers with a specified address space. If the option is set to false, then
@@ -1664,6 +1664,23 @@ Warn on uses of the 'bzero' function.
     bzero(ptr, n); // warn
   }
 
+.. _security-insecureAPI-decodeValueOfObjCType:
+
+security.insecureAPI.decodeValueOfObjCType (C)
+""""""""""""""""""""""""""""""""""""""""""""""
+Warn on uses of the Objective-C method ``-decodeValueOfObjCType:at:``.
+
+.. code-block:: objc
+
+  void test(NSCoder *decoder) {
+    unsigned int x;
+    [decoder decodeValueOfObjCType:"I" at:&x]; // warn
+  }
+
+This diagnostic is emitted only on Apple platforms where the safer
+``-decodeValueOfObjCType:at:size:`` alternative is available
+(iOS 11+, macOS 10.13+, tvOS 11+, watchOS 4.0+).
+
 ..
_security-insecureAPI-getpw:
 
 security.insecureAPI.getpw (C)

From 515924f765407565efb65a70709b3f7d169366d0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 17 Nov 2025 12:38:51 +0000
Subject: [PATCH 008/105] [X86] bittest-big-integer.ll - add BLSR style pattern
 test (#168356)

Test using CTTZ to determine the lowest set bit, clear it, and return
the index.

Shows a failure to use the RMW pattern on the load-btr-store due to
additional (but non-interfering) uses of the load.
---
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 615 +++++++++++++++++++
 1 file changed, 615 insertions(+)

diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 9b7569ff8b29f..b85a20b9d6b6e 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1488,3 +1488,618 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
   store i128 %res2, ptr %word
   ret i1 %cmp1
 }
+
+define i32 @blsr_u512(ptr %word) nounwind {
+; X86-LABEL: blsr_u512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $240, %esp
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    movl 12(%ebx), %esi
+; X86-NEXT:    movl 28(%ebx), %eax
+; X86-NEXT:    movl 60(%ebx), %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 44(%ebx), %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl 20(%ebx), %edx
+; X86-NEXT:    movl 52(%ebx), %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl 4(%ebx), %edi
+; X86-NEXT:    movl 36(%ebx), %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 24(%ebx), %edx
+; X86-NEXT:    movl 56(%ebx), %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl 8(%ebx), %ecx
+; X86-NEXT:    movl 40(%ebx), %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 16(%ebx), %edx
+; X86-NEXT:    movl 48(%ebx), %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl (%ebx), %esi
+; X86-NEXT:    movl 32(%ebx), %ebx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    je .LBB26_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB26_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:
addl $32, %eax +; X86-NEXT: jmp .LBB26_5 +; X86-NEXT: .LBB26_1: +; X86-NEXT: movl $512, %ecx # imm = 0x200 +; X86-NEXT: jmp .LBB26_41 +; X86-NEXT: .LBB26_3: +; X86-NEXT: rep bsfl %ebx, %eax +; X86-NEXT: .LBB26_5: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_6 +; X86-NEXT: # %bb.7: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: jmp .LBB26_8 +; X86-NEXT: .LBB26_6: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: .LBB26_8: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_10 +; X86-NEXT: # %bb.9: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_10: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jne .LBB26_11 +; X86-NEXT: # %bb.12: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_15 +; X86-NEXT: .LBB26_14: +; X86-NEXT: rep bsfl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_17 +; X86-NEXT: jmp .LBB26_18 +; X86-NEXT: .LBB26_11: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_14 +; X86-NEXT: .LBB26_15: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_18 +; X86-NEXT: .LBB26_17: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_18: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: jne .LBB26_20 +; X86-NEXT: # %bb.19: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_20: # %cond.false +; X86-NEXT: addl $256, %eax # imm = 0x100 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jne .LBB26_21 +; X86-NEXT: # %bb.22: # %cond.false +; X86-NEXT: rep bsfl %edi, %ebx +; X86-NEXT: addl $32, %ebx +; X86-NEXT: jmp .LBB26_23 +; X86-NEXT: .LBB26_21: +; X86-NEXT: rep bsfl %edx, %ebx +; X86-NEXT: .LBB26_23: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_24 +; X86-NEXT: # %bb.25: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: je .LBB26_27 +; X86-NEXT: jmp .LBB26_28 +; X86-NEXT: .LBB26_24: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: jne .LBB26_28 +; X86-NEXT: .LBB26_27: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_28: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NEXT: jne .LBB26_29 +; X86-NEXT: # %bb.30: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_33 +; X86-NEXT: .LBB26_32: +; X86-NEXT: rep bsfl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_35 +; X86-NEXT: jmp .LBB26_36 +; X86-NEXT: .LBB26_29: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_32 +; X86-NEXT: .LBB26_33: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_36 +; X86-NEXT: .LBB26_35: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_36: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: jne .LBB26_38 +; X86-NEXT: # %bb.37: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_38: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: jne .LBB26_40 +; X86-NEXT: # %bb.39: # %cond.false +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: .LBB26_40: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .LBB26_41: # %cond.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: 
movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl 56(%edx), %edi +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: andl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl 44(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%edx), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%edx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebx), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: negl %eax +; X86-NEXT: movl 208(%esp,%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, 
%edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%ebx), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 24(%ecx) +; X86-NEXT: movl %esi, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 16(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edi, 4(%ecx) +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 32(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 36(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 44(%ecx) +; X86-NEXT: movl %edx, 48(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 52(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 56(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 60(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: blsr_u512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: movq 56(%rdi), %rcx +; SSE-NEXT: movq 48(%rdi), %rdx +; SSE-NEXT: movq 40(%rdi), %rsi +; SSE-NEXT: movq 32(%rdi), %r11 +; SSE-NEXT: movq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %r9 +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %r10 +; SSE-NEXT: rep bsfq %rax, %rbx +; SSE-NEXT: rep bsfq %r10, %r14 +; SSE-NEXT: addq $64, %r14 +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovneq %rbx, %r14 +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: rep bsfq %r8, %rbx +; SSE-NEXT: addq $64, %rbx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovneq %r15, %rbx +; SSE-NEXT: subq $-128, %rbx +; SSE-NEXT: movq %rax, %r15 +; SSE-NEXT: movq %rax, %r12 +; SSE-NEXT: orq %r10, %r12 +; SSE-NEXT: cmovneq %r14, %rbx +; SSE-NEXT: rep bsfq %r11, %r12 +; SSE-NEXT: rep bsfq %rsi, %r14 +; SSE-NEXT: addq $64, %r14 +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovneq %r12, %r14 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: 
movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: rep bsfq %rdx, %r12 +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addq $64, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovneq %r12, %rax +; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; SSE-NEXT: orq %rsi, %r11 +; SSE-NEXT: cmovneq %r14, %rax +; SSE-NEXT: addq $256, %rax # imm = 0x100 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: orq %r8, %r10 +; SSE-NEXT: orq %r9, %r15 +; SSE-NEXT: orq %r10, %r15 +; SSE-NEXT: cmovneq %rbx, %rax +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $32, %ecx +; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: andl $480, %edx # imm = 0x1E0 +; SSE-NEXT: shrl $3, %edx +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: andl $-8, %esi +; SSE-NEXT: movq -128(%rsp,%rsi), %r8 +; SSE-NEXT: shrq %cl, %r8 +; SSE-NEXT: movl -120(%rsp,%rsi), %esi +; SSE-NEXT: addl %esi, %esi +; SSE-NEXT: notl %ecx +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: orl %r8d, %esi +; SSE-NEXT: btrl %eax, %esi +; SSE-NEXT: movl %esi, (%rdi,%rdx) +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: blsr_u512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 56(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 32(%rdi), %r11 +; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: movq (%rdi), %r9 +; AVX2-NEXT: movq 8(%rdi), %r10 +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: tzcntq %r10, %rax +; AVX2-NEXT: addq $64, %rax +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovneq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r8, %r14 +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addq $64, %rbx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovneq %r14, %rbx +; AVX2-NEXT: subq $-128, %rbx +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: movq %r9, %r15 +; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: cmovneq %rax, %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r11, %rax +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rdx, %r12 +; AVX2-NEXT: addq $64, %r12 +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovneq %rax, %r12 +; AVX2-NEXT: movq 48(%rdi), %r15 +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %r15, %r13 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addq $64, %rax +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovneq %r13, %rax +; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: orq %rdx, %r11 +; AVX2-NEXT: cmovneq %r12, %rax +; AVX2-NEXT: addq $256, %rax # imm = 0x100 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: orq %rsi, %r10 +; AVX2-NEXT: orq %r8, %r14 +; AVX2-NEXT: orq %r10, %r14 +; AVX2-NEXT: cmovneq %rbx, %rax +; AVX2-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r15, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    andl $32, %ecx
+; AVX2-NEXT:    movl %eax, %edx
+; AVX2-NEXT:    andl $480, %edx # imm = 0x1E0
+; AVX2-NEXT:    shrl $3, %edx
+; AVX2-NEXT:    movl %edx, %esi
+; AVX2-NEXT:    andl $-8, %esi
+; AVX2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
+; AVX2-NEXT:    notl %ecx
+; AVX2-NEXT:    movl -120(%rsp,%rsi), %esi
+; AVX2-NEXT:    addl %esi, %esi
+; AVX2-NEXT:    shlxq %rcx, %rsi, %rcx
+; AVX2-NEXT:    orl %r8d, %ecx
+; AVX2-NEXT:    btrl %eax, %ecx
+; AVX2-NEXT:    movl %ecx, (%rdi,%rdx)
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: blsr_u512:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    vmovups (%rdi), %ymm0
+; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm2
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm3
+; AVX512-NEXT:    vpandnq %zmm3, %zmm2, %zmm3
+; AVX512-NEXT:    vplzcntq %zmm3, %zmm3
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT:    vpcompressq %zmm3, %zmm2 {%k1}
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    andl $32, %ecx
+; AVX512-NEXT:    movl %ecx, %edx
+; AVX512-NEXT:    notl %edx
+; AVX512-NEXT:    movl %eax, %esi
+; AVX512-NEXT:    shrl $3, %esi
+; AVX512-NEXT:    movl %esi, %r8d
+; AVX512-NEXT:    andl $56, %r8d
+; AVX512-NEXT:    movl -120(%rsp,%r8), %r9d
+; AVX512-NEXT:    addl %r9d, %r9d
+; AVX512-NEXT:    shlxq %rdx, %r9, %rdx
+; AVX512-NEXT:    shrl $3, %ecx
+; AVX512-NEXT:    addq %rsp, %r8
+; AVX512-NEXT:    addq $-128, %r8
+; AVX512-NEXT:    orl (%rcx,%r8), %edx
+; AVX512-NEXT:    btrl %eax, %edx
+; AVX512-NEXT:    andl $60, %esi
+; AVX512-NEXT:    movl %edx, (%rdi,%rsi)
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    popq %rcx
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ld = load i512, ptr %word
+  %tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false)
+  %tz.cast = trunc nuw nsw i512 %tz to i32
+  %tz.mask = and i512 %tz, 511
+  %mask = shl nuw i512 1, %tz.mask
+  %mask.not = xor i512 %mask, -1
+  %blsr = and i512 %ld, %mask.not
+  store i512 %blsr, ptr %word
+  ret i32 %tz.cast
+}

From 6b464e4ac0b1ce4638c0fa07abcba329119836cb Mon Sep 17 00:00:00 2001
From: Sergej Salnikov
Date: Mon, 17 Nov 2025 13:53:34 +0100
Subject: [PATCH 009/105] [clang][SourceManager] Use `getFileLoc` when
 computing `getPresumedLoc` (#166255)

Now the file location is used for macro expansions. This provides a
more accurate location when reporting compilation errors.

Move from `getDecomposedExpansionLoc(Loc)` to
`getDecomposedLoc(getFileLoc(Loc))` when computing the presumed
location.
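To illustrate the effect on diagnostics, here is a minimal sketch (the
macro and names are hypothetical, not taken from this patch; the updated
in-tree tests below, e.g. FixIt/format.cpp, exercise the real cases):

```c
#include <stdio.h>

#define LOG(fmt, arg) printf(fmt, arg) // hypothetical logging macro

void f(long n) {
  LOG("%d", // before this change, the -Wformat mismatch was attributed
            // to this line, the expansion point of LOG
      n);   // after it, the warning points here, at the spelling
            // location of the macro argument 'n'
}
```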
--- clang/include/clang/Basic/SourceManager.h | 5 +++-- clang/lib/Basic/SourceManager.cpp | 2 +- clang/test/Analysis/plist-macros-with-expansion.cpp | 8 ++++---- clang/test/C/C23/n2350.c | 5 ++--- clang/test/ExtractAPI/macro_undefined.c | 4 ++-- clang/test/FixIt/format.cpp | 8 ++++---- clang/test/Preprocessor/macro_arg_directive.c | 4 ++-- clang/test/Preprocessor/print_line_track.c | 11 +++++------ 8 files changed, 23 insertions(+), 24 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index bc9e97863556d..f15257a760b8c 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1464,8 +1464,9 @@ class SourceManager : public RefCountedBase { /// directives. This provides a view on the data that a user should see /// in diagnostics, for example. /// - /// Note that a presumed location is always given as the expansion point of - /// an expansion location, not at the spelling location. + /// If \p Loc is a macro expansion location, the presumed location + /// computation uses the spelling location for macro arguments and the + /// expansion location for other macro expansions. /// /// \returns The presumed location of the specified SourceLocation. If the /// presumed location cannot be calculated (e.g., because \p Loc is invalid diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b6cc6ec9365f5..767a765ae4261 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -1435,7 +1435,7 @@ PresumedLoc SourceManager::getPresumedLoc(SourceLocation Loc, if (Loc.isInvalid()) return PresumedLoc(); // Presumed locations are always for expansion points. - FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc); + FileIDAndOffset LocInfo = getDecomposedLoc(getFileLoc(Loc)); bool Invalid = false; const SLocEntry &Entry = getSLocEntry(LocInfo.first, &Invalid); diff --git a/clang/test/Analysis/plist-macros-with-expansion.cpp b/clang/test/Analysis/plist-macros-with-expansion.cpp index d57bb0f2dd265..d9a2f94055593 100644 --- a/clang/test/Analysis/plist-macros-with-expansion.cpp +++ b/clang/test/Analysis/plist-macros-with-expansion.cpp @@ -405,14 +405,14 @@ void commaInBracketsTest() { code void commaInBracesTest() { - PASTE_CODE({ // expected-warning{{Dereference of null pointer}} + PASTE_CODE({ // NOTE: If we were to add a new variable here after a comma, we'd get a // compilation error, so this test is mainly here to show that this was also // investigated. // // int *ptr = nullptr, a; int *ptr = nullptr; - *ptr = 5; + *ptr = 5; // expected-warning{{Dereference of null pointer}} }) } @@ -425,14 +425,14 @@ void commaInBracesTest() { // CHECK-NEXT: col3 // CHECK-NEXT: file0 // CHECK-NEXT: -// CHECK-NEXT: namePASTE_CODE({ // expected- +// CHECK-NEXT: namePASTE_CODE({ // CHECK-NEXT: // NOTE: If we were to add a new variable here after a comma, we'd get a // CHECK-NEXT: // compilation error, so this test is mainly here to show that this was also // CHECK-NEXT: // investigated. 
// CHECK-NEXT: // // CHECK-NEXT: // int *ptr = nullptr, a; // CHECK-NEXT: int *ptr = nullptr; -// CHECK-NEXT: *ptr = 5; +// CHECK-NEXT: *ptr = 5; // expected- // CHECK-NEXT: }) // CHECK-NEXT: expansion{int *ptr =nullptr ;*ptr =5;} // CHECK-NEXT: diff --git a/clang/test/C/C23/n2350.c b/clang/test/C/C23/n2350.c index af0ca6d79be5e..96b8c511d5716 100644 --- a/clang/test/C/C23/n2350.c +++ b/clang/test/C/C23/n2350.c @@ -47,11 +47,10 @@ int struct_in_second_param(void) { int macro(void) { return offsetof(struct A // cpp-error {{'A' cannot be defined in a type specifier}} \ - expected-warning 2 {{defining a type within 'offsetof' is a C23 extension}} + expected-warning {{defining a type within 'offsetof' is a C23 extension}} { int a; - struct B // verifier seems to think the error is emitted by the macro - // In fact the location of the error is "B" on the line above + struct B // expected-warning {{defining a type within 'offsetof' is a C23 extension}} { int c; int d; diff --git a/clang/test/ExtractAPI/macro_undefined.c b/clang/test/ExtractAPI/macro_undefined.c index 7bb50af380c24..1d697db1e1613 100644 --- a/clang/test/ExtractAPI/macro_undefined.c +++ b/clang/test/ExtractAPI/macro_undefined.c @@ -89,7 +89,7 @@ FUNC_GEN(bar, const int *, unsigned); }, "location": { "position": { - "character": 0, + "character": 9, "line": 2 }, "uri": "file://INPUT_DIR/input.h" @@ -241,7 +241,7 @@ FUNC_GEN(bar, const int *, unsigned); }, "location": { "position": { - "character": 0, + "character": 9, "line": 3 }, "uri": "file://INPUT_DIR/input.h" diff --git a/clang/test/FixIt/format.cpp b/clang/test/FixIt/format.cpp index d663c0fb35e13..db642b60ffd95 100644 --- a/clang/test/FixIt/format.cpp +++ b/clang/test/FixIt/format.cpp @@ -56,9 +56,9 @@ void a(N::E NEVal, S *SPtr, S &SRef) { // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:7}:"static_cast(" // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:17}:")" - LOG( // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}} + LOG( "%d", - SPtr->Type + SPtr->Type // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}} ); // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:7}:"static_cast(" // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:17}:")" @@ -68,8 +68,8 @@ void a(N::E NEVal, S *SPtr, S &SRef) { // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:7}:"static_cast(" // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:")" - LOG("%d", // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}} - SRef.Type); + LOG("%d", + SRef.Type); // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:7}:"static_cast(" // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:")" diff --git a/clang/test/Preprocessor/macro_arg_directive.c b/clang/test/Preprocessor/macro_arg_directive.c index 929a03d70d025..c612aa545a2a9 100644 --- a/clang/test/Preprocessor/macro_arg_directive.c +++ b/clang/test/Preprocessor/macro_arg_directive.c @@ -18,7 +18,7 @@ void fail(const char *); ({ int result = 0; __VA_ARGS__; if (!result) { fail(#__VA_ARGS__); }; result }) static inline int f(int k) { - return MUNCH( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{returning 'void'}} expected-note {{expansion of macro 'MUNCH' requested here}} + return MUNCH( // expected-note {{to match this '('}} expected-error {{returning 'void'}} expected-note {{expansion of macro 'MUNCH' requested here}} if 
(k < 3)
    result = 24;
  else if (k > 4)
@@ -27,6 +27,6 @@ static inline int f(int k) {
 
 #include "macro_arg_directive.h" // expected-error {{embedding a #include directive within macro arguments is not supported}}
 
-int g(int k) {
+int g(int k) { // expected-error {{expected ')'}}
   return f(k) + f(k-1));
 }
diff --git a/clang/test/Preprocessor/print_line_track.c b/clang/test/Preprocessor/print_line_track.c
index 156ae22693b85..56f30073e3e86 100644
--- a/clang/test/Preprocessor/print_line_track.c
+++ b/clang/test/Preprocessor/print_line_track.c
@@ -1,9 +1,9 @@
-/* RUN: %clang_cc1 -E %s | grep 'a 3'
- * RUN: %clang_cc1 -E %s | grep 'b 16'
- * RUN: %clang_cc1 -E -P %s | grep 'a 3'
- * RUN: %clang_cc1 -E -P %s | grep 'b 16'
+/* RUN: %clang_cc1 -E %s | grep -z 'a.3'
+ * RUN: %clang_cc1 -E %s | grep -z 'b.16'
+ * RUN: %clang_cc1 -E -P %s | grep -z 'a.3'
+ * RUN: %clang_cc1 -E -P %s | grep -z 'b.16'
  * RUN: %clang_cc1 -E %s | not grep '# 0 '
- * RUN: %clang_cc1 -E -P %s | count 2
+ * RUN: %clang_cc1 -E -P %s | count 4
  * PR1848 PR3437 PR7360
  */
 
@@ -14,4 +14,3 @@ t(a
 t(b __LINE__)
-

From e3cfb1710ed6380dc0e50bcf3c697d8977cb0037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20Benics?=
Date: Mon, 17 Nov 2025 13:25:00 +0000
Subject: [PATCH 010/105] Update Clang Maintainers (#168271)

I left Sonar at the end of October. For my upcoming contributions,
I'll simply use my personal (this) account.

I'll remain a Clang Static Analyzer maintainer, but I'll likely spend
less time on that part, as it falls outside my key responsibilities in
my new job.

From now on, I'm part of the Apple org, but for accessibility, I'll
keep using my personal email address for open-source contributions and
for the build bots.

---
 clang/Maintainers.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst
index 847d37d124083..8a5a7ed7c2a41 100644
--- a/clang/Maintainers.rst
+++ b/clang/Maintainers.rst
@@ -147,7 +147,6 @@ Clang static analyzer
 
 | Balázs Benics
 | benicsbalazs\@gmail.com (email), steakhal (Phabricator), steakhal (GitHub)
-| balazs.benics\@sonarsource.com (email), balazs-benics-sonarsource (GitHub)
 
 Compiler options
 ~~~~~~~~~~~~~~~~

From ae2fec04cc2cc771070d33437dd4b03560c49657 Mon Sep 17 00:00:00 2001
From: Benjamin Chetioui <3920784+bchetioui@users.noreply.github.com>
Date: Mon, 17 Nov 2025 14:30:40 +0100
Subject: [PATCH 011/105] [mlir][bazel] Fix build after #167848.
 (#168366)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index effcd615786bf..153c7eeedd0ab 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -9860,6 +9860,7 @@ cc_library(
     deps = [
         ":SparseTensorEnums",
         ":SparseTensorRuntime",
+        ":mlir_apfloat_utils",
         ":mlir_float16_utils",
         "//llvm:Support",
     ],

From e468ea3f40415c48281755c68548ba49480a2259 Mon Sep 17 00:00:00 2001
From: Erick Ochoa Lopez
Date: Mon, 17 Nov 2025 08:34:21 -0500
Subject: [PATCH 012/105] [mlir][amdgpu] Fix documentation and verifiers
 (#167369)

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 35 +++++++++++++------
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 23 +++++++++---
 mlir/test/Dialect/AMDGPU/invalid.mlir         | 20 +++++++----
 3 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 45cb67f0eee4a..4820b7a747ac2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPacked816Op
        FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
        ConfinedAttr:$blockSize,
        ConfinedAttr, IntMaxValue<1>]>:$firstScaleLane,
-       ConfinedAttr, IntMaxValue<2>]>:$firstScaleByte)>,
+       ConfinedAttr, IntMaxValue<3>]>:$firstScaleByte)>,
   Results<(
     outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
                     FixedVectorOfShapeAndType<[8], F16>,
@@ -139,17 +139,21 @@

   let summary = "Extend a vector of packed floating point values";
   let description = [{
-    The scales applied to the input microfloats are stored in two bytes which
+    The scales applied to the input microfloats are stored in bytes which
     come from the `scales` input provided in a *half* of the wave identified
-    by `firstScaleLane`. The pair of bytes used is selected by
-    `firstScaleByte`. The 16 vectors in consecutive lanes starting from
+    by `firstScaleLane`. The bytes used are selected by `firstScaleByte` and depend
+    on the type of `source`. The 16 vectors in consecutive lanes starting from
     `firstScaleLane` (which we'll call the scale vectors) will be used by both
-    halves of the wave (with lane L reading from L % 16'th scale vector), but
-    each half will use a different byte.
+    halves of the wave (with lane L reading from L % 16'th scale vector).
+
+    When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN, each half of the
+    wave will use a different byte: the first being `firstScaleByte` and
+    the second `firstScaleByte` + 1. When the block size is 32,
+    `firstScaleByte` can be either 0 or 2, selecting halves of the scale vectors.
+    Lanes 0-15 will read from `firstScaleByte` and lanes 16-31 will read
+    from `firstScaleByte` + 1.
+
-    When the block size is 32, `firstScaleByte` can be either 0 or 2,
-    selecting halves of the scale vectors. Lanes 0-15 will read from
-    `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1.
     For example:
     ```mlir
     // Input: 8-element vector of F8E4M3FN, converting to F32
@@ -165,7 +169,8 @@ def AMDGPU_ScaledExtPacked816Op
         : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
     ```
 
-    However, when the block size is 16, `firstScaleByte` can be 0 or 1.
+    When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN and
+    the block size is 16, `firstScaleByte` can be 0 or 1.
    Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors,
    while lanes 16-31 read from `firstScaleByte` + 2.
    For example:
@@ -187,6 +192,16 @@ def AMDGPU_ScaledExtPacked816Op
     instructions use for matrix scales. These selection operands allow one
     to choose portions of the matrix to convert.
 
+    When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32,
+    then the same byte will be used by both halves of the wave.
+    In this case, `firstScaleByte` can be any value from 0 to 3.
+
+    When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16,
+    the following combinations are allowed:
+      * `firstScaleLane(0), firstScaleByte(0)`
+      * `firstScaleLane(1), firstScaleByte(2)`
+    All other combinations are reserved.
+
     Available on gfx1250+.
   }];
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index df955fc90b45f..5c35823678576 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -344,14 +344,27 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
 LogicalResult ScaledExtPacked816Op::verify() {
   int blockSize = getBlockSize();
   assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+
   int firstScaleByte = getFirstScaleByte();
-  if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) {
-    return emitOpError(
-        "blockSize of 16 can only have firstScaleByte be 0 or 1.");
+  auto sourceType = cast(getSource().getType());
+  Type elementType = sourceType.getElementType();
+  auto floatType = cast(elementType);
+  int bitWidth = floatType.getWidth();
+
+  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
+      !llvm::is_contained({0, 1}, firstScaleByte)) {
+    return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
+                       "for f4 and f6.");
+  }
+  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
+      !llvm::is_contained({0, 2}, firstScaleByte)) {
+    return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
+                       "for f4 and f6.");
   }
-  if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) {
+  if (bitWidth == 8 && blockSize == 16 &&
+      !llvm::is_contained({0, 2}, firstScaleByte)) {
     return emitOpError(
-        "blockSize of 32 can only have firstScaleByte be 0 or 2.");
+        "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.");
   }
   return success();
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 4c6f62a045405..5c8cc8b67c4b3 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -333,17 +333,25 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
 // -----
 
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
+  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}}
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
   func.return
 }
 
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
+  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}}
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
+  func.return
+}
+
+// -----
+
+func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
+  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}}
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
   func.return
 }

From ef023cae388d7becd18df602cb2d77bdb3d59e55 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Mon, 17 Nov 2025 13:44:25 +0000
Subject: [PATCH 013/105] Reland [VPlan] Expand WidenInt inductions with
 nuw/nsw (#168354)

Changes: The previous patch had to be reverted due to a mismatching-OpType
assert in CSE. The reduced test has now been added, corresponding to an RVV
pointer induction, and the pointer-induction case has been updated to use
createOverflowingOp. While at it, record VPIRFlags in
VPWidenInductionRecipe.
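For illustration, a minimal IR sketch of the intended effect (block and value
names here are illustrative, not taken from a specific test): the widened IV
update now inherits the wrap flags of the original scalar IV increment
instead of being emitted as a plain `add`:

    vector.body:
      %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
                               [ %vec.ind.next, %vector.body ]
      ...
      ; previously emitted without nuw/nsw
      %vec.ind.next = add nuw nsw <4 x i64> %vec.ind, splat (i64 4)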
--- flang/test/Integration/unroll-loops.f90 | 2 +- flang/test/Lower/HLFIR/unroll-loops.fir | 2 +- .../Vectorize/LoopVectorizationPlanner.h | 9 +- .../Transforms/Vectorize/LoopVectorize.cpp | 15 +- llvm/lib/Transforms/Vectorize/VPlan.h | 14 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 21 ++- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 13 ++ .../AArch64/clamped-trip-count.ll | 8 +- .../AArch64/conditional-branches-cost.ll | 2 +- .../AArch64/epilog-iv-select-cmp.ll | 22 +-- .../epilog-vectorization-widen-inductions.ll | 36 ++--- .../LoopVectorize/AArch64/optsize_minsize.ll | 22 +-- .../AArch64/outer_loop_prefer_scalable.ll | 6 +- ...outer_loop_test1_no_explicit_vect_width.ll | 4 +- .../LoopVectorize/AArch64/predicated-costs.ll | 2 +- .../AArch64/scalable-avoid-scalarization.ll | 6 +- ...ng-compatible-sve-no-maximize-bandwidth.ll | 4 +- .../LoopVectorize/AArch64/sve-inductions.ll | 2 +- .../AArch64/sve-interleaved-accesses.ll | 40 ++--- .../sve-interleaved-masked-accesses.ll | 8 +- .../LoopVectorize/AArch64/sve-tail-folding.ll | 8 +- .../AArch64/sve2-histcnt-too-many-deps.ll | 2 +- .../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +- .../ARM/mve-gather-scatter-tailpred.ll | 14 +- .../ARM/tail-folding-not-allowed.ll | 2 +- .../RISCV/blocks-with-dead-instructions.ll | 48 +++--- .../LoopVectorize/RISCV/dead-ops-cost.ll | 8 +- .../RISCV/evl-compatible-loops.ll | 4 +- .../RISCV/interleaved-masked-access.ll | 4 +- .../RISCV/interleaved-store-with-gap.ll | 2 +- .../LoopVectorize/RISCV/mask-index-type.ll | 4 +- .../RISCV/masked_gather_scatter.ll | 16 +- .../LoopVectorize/RISCV/pointer-induction.ll | 64 ++++++++ .../LoopVectorize/RISCV/safe-dep-distance.ll | 2 +- .../LoopVectorize/RISCV/strided-accesses.ll | 26 ++-- .../RISCV/tail-folding-cast-intrinsics.ll | 10 +- .../RISCV/tail-folding-cond-reduction.ll | 48 +++--- .../tail-folding-fixed-order-recurrence.ll | 10 +- .../RISCV/tail-folding-interleave.ll | 56 +++---- .../LoopVectorize/RISCV/uniform-load-store.ll | 28 ++-- .../X86/CostModel/vpinstruction-cost.ll | 4 +- .../X86/consecutive-ptr-uniforms.ll | 2 +- ...bounds-flags-for-reverse-vector-pointer.ll | 2 +- .../X86/epilog-vectorization-inductions.ll | 28 ++-- .../X86/fixed-order-recurrence.ll | 2 +- .../LoopVectorize/X86/gather_scatter.ll | 20 +-- .../LoopVectorize/X86/interleave-cost.ll | 2 +- ...rleaved-accesses-sink-store-across-load.ll | 4 +- .../LoopVectorize/X86/masked_load_store.ll | 2 +- .../Transforms/LoopVectorize/X86/optsize.ll | 4 +- ...outer_loop_test1_no_explicit_vect_width.ll | 2 +- .../Transforms/LoopVectorize/X86/pr36524.ll | 2 +- ...6-sunk-instruction-used-outside-of-loop.ll | 2 +- .../X86/pr55096-scalarize-add.ll | 2 +- .../Transforms/LoopVectorize/X86/pr81872.ll | 2 +- .../LoopVectorize/X86/scatter_crash.ll | 36 ++--- ...-narrow-interleave-to-widen-memory-gaps.ll | 2 +- ...ned-value-used-as-scalar-and-first-lane.ll | 2 +- .../x86-interleaved-accesses-masked-group.ll | 30 ++-- ...86-interleaved-store-accesses-with-gaps.ll | 6 +- .../LoopVectorize/X86/x86-predication.ll | 2 +- llvm/test/Transforms/LoopVectorize/assume.ll | 2 +- .../LoopVectorize/check-prof-info.ll | 32 ++-- .../LoopVectorize/consecutive-ptr-uniforms.ll | 4 +- .../cse-gep-source-element-type.ll | 2 +- ...able-info-from-assumption-constant-size.ll | 116 +++++++++------ .../LoopVectorize/epilog-iv-select-cmp.ll | 24 +-- .../epilog-vectorization-reductions.ll | 12 +- .../first-order-recurrence-chains.ll | 15 +- .../LoopVectorize/first-order-recurrence.ll | 8 +- 
.../LoopVectorize/induction-step.ll | 30 ++-- .../Transforms/LoopVectorize/induction.ll | 124 ++++++++-------- .../instruction-only-used-outside-of-loop.ll | 4 +- .../interleaved-accesses-gep-nowrap-flags.ll | 36 +++-- .../LoopVectorize/interleaved-accesses.ll | 22 +-- .../LoopVectorize/iv-select-cmp-decreasing.ll | 16 +- .../LoopVectorize/iv-select-cmp-no-wrap.ll | 4 +- .../LoopVectorize/iv-select-cmp-trunc.ll | 16 +- .../Transforms/LoopVectorize/iv-select-cmp.ll | 36 ++--- .../LoopVectorize/iv_outside_user.ll | 6 +- .../LoopVectorize/load-deref-pred-align.ll | 8 +- .../Transforms/LoopVectorize/loop-scalars.ll | 2 +- .../LoopVectorize/no_outside_user.ll | 84 +++++------ .../LoopVectorize/noalias-scope-decl.ll | 2 +- .../optimal-epilog-vectorization.ll | 16 +- llvm/test/Transforms/LoopVectorize/optsize.ll | 2 +- .../outer-loop-inner-latch-successors.ll | 4 +- .../outer-loop-vec-phi-predecessor-order.ll | 2 +- .../LoopVectorize/outer-loop-wide-phis.ll | 4 +- .../outer_loop_hcfg_construction.ll | 4 +- .../LoopVectorize/outer_loop_scalable.ll | 6 +- .../LoopVectorize/outer_loop_test1.ll | 2 +- .../LoopVectorize/outer_loop_test2.ll | 2 +- .../pr30654-phiscev-sext-trunc.ll | 24 +-- llvm/test/Transforms/LoopVectorize/pr34681.ll | 4 +- llvm/test/Transforms/LoopVectorize/pr35773.ll | 4 +- .../LoopVectorize/pr36983-multiple-lcssa.ll | 2 +- .../pr39417-optsize-scevchecks.ll | 7 +- .../pr55167-fold-tail-live-out.ll | 2 +- .../preserve-dbg-loc-and-loop-metadata.ll | 8 +- .../LoopVectorize/reduction-small-size.ll | 2 +- .../reduction-with-invariant-store.ll | 4 +- .../LoopVectorize/scalable-assume.ll | 6 +- .../LoopVectorize/scalable-inductions.ll | 10 +- .../LoopVectorize/single-value-blend-phis.ll | 14 +- .../LoopVectorize/uitofp-preserve-nneg.ll | 2 +- .../Transforms/LoopVectorize/uniform-blend.ll | 2 +- .../uniform_across_vf_induction1.ll | 24 +-- .../uniform_across_vf_induction1_and.ll | 16 +- .../uniform_across_vf_induction1_div_urem.ll | 6 +- .../uniform_across_vf_induction1_lshr.ll | 26 ++-- .../uniform_across_vf_induction2.ll | 140 +++++++++--------- .../Transforms/LoopVectorize/vector-geps.ll | 2 +- .../vplan-printing-reductions.ll | 2 +- .../LoopVectorize/vplan-printing.ll | 4 +- .../vplan-sink-scalars-and-merge.ll | 4 +- .../vplan-vectorize-inner-loop-reduction.ll | 2 +- .../vplan-widen-call-instruction.ll | 2 +- .../vplan-widen-select-instruction.ll | 8 +- .../widen-gep-all-indices-invariant.ll | 2 +- 121 files changed, 934 insertions(+), 800 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 index 87ab9efeb703b..2c4a3495eb0b7 100644 --- a/flang/test/Integration/unroll-loops.f90 +++ b/flang/test/Integration/unroll-loops.f90 @@ -25,7 +25,7 @@ subroutine unroll(a) ! NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] ! NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]] ! NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2 - ! NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2) + ! NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2) ! ! UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2) ! 
UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] diff --git a/flang/test/Lower/HLFIR/unroll-loops.fir b/flang/test/Lower/HLFIR/unroll-loops.fir index 89e8ce82d6f3f..1ccb6b1bd0315 100644 --- a/flang/test/Lower/HLFIR/unroll-loops.fir +++ b/flang/test/Lower/HLFIR/unroll-loops.fir @@ -27,7 +27,7 @@ func.func @unroll(%arg0: !fir.ref> {fir.bindc_name = "a // NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] // NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]] // NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2 - // NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2) + // NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2) // UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2) // UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]] diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 04b05627fa769..5dc3175382254 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -178,11 +178,10 @@ class VPBuilder { new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); } - VPInstruction *createOverflowingOp(unsigned Opcode, - ArrayRef Operands, - VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, - DebugLoc DL = DebugLoc::getUnknown(), - const Twine &Name = "") { + VPInstruction *createOverflowingOp( + unsigned Opcode, ArrayRef Operands, + VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name)); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cbfbc29360b0b..10bd6cd471152 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7639,6 +7639,10 @@ createWidenInductionRecipes(VPInstruction *PhiR, assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && "Start VPValue must match IndDesc's start value"); + // It is always safe to copy over the NoWrap and FastMath flags. In + // particular, when folding tail by masking, the masked-off lanes are never + // used, so it is safe. + VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); @@ -7651,7 +7655,7 @@ createWidenInductionRecipes(VPInstruction *PhiR, PHINode *Phi = cast(PhiR->getUnderlyingInstr()); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, PhiR->getDebugLoc()); + IndDesc, Flags, PhiR->getDebugLoc()); } VPHeaderPHIRecipe * @@ -7705,10 +7709,15 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, PHINode *Phi = WidenIV->getPHINode(); VPValue *Start = WidenIV->getStartValue(); const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor(); + + // It is always safe to copy over the NoWrap and FastMath flags. In + // particular, when folding tail by masking, the masked-off lanes are never + // used, so it is safe. 
+ VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, I, VPI->getDebugLoc()); + return new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc()); } VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 13131a2b61722..0932922c07126 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2151,7 +2151,8 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe { /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector values. This is an abstract recipe and must be /// converted to concrete recipes before executing. -class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { +class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe, + public VPIRFlags { TruncInst *Trunc; // If this recipe is unrolled it will have 2 additional operands. @@ -2160,19 +2161,20 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, VPValue *VF, const InductionDescriptor &IndDesc, - DebugLoc DL) + const VPIRFlags &Flags, DebugLoc DL) : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, Step, IndDesc, DL), - Trunc(nullptr) { + VPIRFlags(Flags), Trunc(nullptr) { addOperand(VF); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, VPValue *VF, const InductionDescriptor &IndDesc, - TruncInst *Trunc, DebugLoc DL) + TruncInst *Trunc, const VPIRFlags &Flags, + DebugLoc DL) : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, Step, IndDesc, DL), - Trunc(Trunc) { + VPIRFlags(Flags), Trunc(Trunc) { addOperand(VF); SmallVector> Metadata; (void)Metadata; @@ -2186,7 +2188,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { VPWidenIntOrFpInductionRecipe *clone() override { return new VPWidenIntOrFpInductionRecipe( getPHINode(), getStartValue(), getStepValue(), getVFValue(), - getInductionDescriptor(), Trunc, getDebugLoc()); + getInductionDescriptor(), Trunc, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index cf95b4eac9d75..e2a8e495d5ed5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2387,7 +2387,9 @@ void VPWidenIntOrFpInductionRecipe::printRecipe( raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent; printAsOperand(O, SlotTracker); - O << " = WIDEN-INDUCTION "; + O << " = WIDEN-INDUCTION"; + printFlags(O); + O << " "; printOperands(O, SlotTracker); if (auto *TI = getTruncInst()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9bb61308cb7d9..bbeb447de45cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -76,8 +76,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); + // It is always 
safe to copy over the NoWrap and FastMath flags. In
+      // particular, when folding tail by masking, the masked-off lanes are
+      // never used, so it is safe.
+      VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
       NewRecipe = new VPWidenIntOrFpInductionRecipe(
-          Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
+          Phi, Start, Step, &Plan.getVF(), *II, Flags,
+          Ingredient.getDebugLoc());
     }
   } else {
     assert(isa(&Ingredient) &&
@@ -542,6 +547,11 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
   // only.
   if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
       vputils::onlyFirstLaneUsed(WidenNewIV)) {
+    // We are replacing a wide canonical iv with a suitable wide induction.
+    // This is used to compute the header mask, hence all lanes will be used
+    // and we need to drop wrap flags only applying to lanes guaranteed to
+    // execute in the original scalar loop.
+    WidenOriginalIV->dropPoisonGeneratingFlags();
     WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
     WidenNewIV->eraseFromParent();
     return;
@@ -3285,16 +3295,13 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
   const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
   Instruction::BinaryOps AddOp;
   Instruction::BinaryOps MulOp;
-  // FIXME: The newly created binary instructions should contain nsw/nuw
-  // flags, which can be found from the original scalar operations.
-  VPIRFlags Flags;
+  VPIRFlags Flags = *WidenIVR;
   if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
     AddOp = Instruction::Add;
     MulOp = Instruction::Mul;
   } else {
     AddOp = ID.getInductionOpcode();
     MulOp = Instruction::FMul;
-    Flags = ID.getInductionBinOp()->getFastMathFlags();
   }
 
   // If the phi is truncated, truncate the start and step values.
@@ -3406,7 +3413,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
   Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
   Type *StepTy = TypeInfo.inferScalarType(Step);
   VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
-  Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
+  Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
   VPValue *PtrAdd = Builder.createNaryOp(
       VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
   R->replaceAllUsesWith(PtrAdd);
@@ -3416,7 +3423,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
   Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
   VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
                                        TypeInfo.inferScalarType(VF), DL);
-  VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
+  VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
   VPValue *InductionGEP =
       Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index df1613d760a04..51bafe0846141 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -73,6 +73,19 @@ std::optional
 getRecipesForUncountableExit(VPlan &Plan,
                              SmallVectorImpl &Recipes,
                              SmallVectorImpl &GEPs);
+
+/// Extracts and returns NoWrap and FastMath flags from the induction binop in
+/// \p ID.
+inline VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID) { + if (ID.getKind() == InductionDescriptor::IK_FpInduction) + return ID.getInductionBinOp()->getFastMathFlags(); + + if (auto *OBO = dyn_cast_if_present( + ID.getInductionBinOp())) + return VPIRFlags::WrapFlagsTy(OBO->hasNoUnsignedWrap(), + OBO->hasNoSignedWrap()); + return {}; +} } // namespace vputils //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 0415b01d78b46..ac8095ae5c3e7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -14,8 +14,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -76,8 +76,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP8]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 2f7e3568d5654..cb4bd793013b1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1052,7 +1052,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) { ; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4) ; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index a49f089bd2085..2180f18750bf2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -35,7 +35,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP10]] = select <16 x i1> [[TMP17]], <16 x i8> [[VEC_IND]], <16 x i8> [[VEC_PHI]] ; CHECK-NEXT: [[TMP11]] = select <16 x i1> [[TMP23]], <16 x i8> [[STEP_ADD]], <16 x i8> [[VEC_PHI2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[STEP_ADD]], splat (i8 16) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i8> [[STEP_ADD]], splat (i8 16) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -48,7 +48,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -62,11 +62,11 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT11]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <8 x i8> [[DOTSPLAT11]], ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]] @@ -74,9 +74,9 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD12]], splat (i8 3) ; CHECK-NEXT: [[TMP20]] = select <8 x i1> [[TMP19]], <8 x i8> [[VEC_IND7]], <8 x i8> [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i32 [[INDEX6]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <8 x i8> [[VEC_IND7]], splat (i8 8) +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add nuw nsw <8 x i8> [[VEC_IND7]], splat (i8 8) ; 
CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT13]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]]) ; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128 @@ -96,7 +96,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV1]], i8 [[RDX]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[SEL_LCSSA]] @@ -158,7 +158,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]]) ; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP5]]) @@ -170,7 +170,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -197,7 +197,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]]) ; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648 @@ -216,7 +216,7 @@ 
define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index d23e3c29b59e5..3010a9d75d039 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -46,7 +46,7 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 -; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 @@ -128,7 +128,7 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4 ; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: middle.block: @@ -136,14 +136,14 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i64> [[DOTSPLAT]], ; CHECK-NEXT: br label 
[[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] @@ -151,7 +151,7 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX5]] ; CHECK-NEXT: store <2 x i64> [[VEC_IND6]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i64> [[VEC_IND6]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT7]] = add nuw nsw <2 x i64> [[VEC_IND6]], splat (i64 2) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: @@ -200,7 +200,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i64> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -212,7 +212,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4 ; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: middle.block: @@ -221,7 +221,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[START]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -230,7 +230,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: [[IND_END4:%.*]] = add i64 [[START]], [[N_VEC3]] ; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT8]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION10:%.*]] = add <2 x i64> [[DOTSPLAT9]], +; CHECK-NEXT: [[INDUCTION10:%.*]] = add nuw nsw <2 x i64> [[DOTSPLAT9]], ; CHECK-NEXT: br label 
[[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] @@ -239,7 +239,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX13]] ; CHECK-NEXT: store <2 x i64> [[VEC_IND11]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <2 x i64> [[VEC_IND11]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT12]] = add nuw nsw <2 x i64> [[VEC_IND11]], splat (i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: @@ -296,7 +296,7 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP1]], align 4 ; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IND_END4]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: middle.block: @@ -304,14 +304,14 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i64> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] @@ -320,7 +320,7 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[VEC_IND8]], splat (i64 10) ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <2 x i64> [[VEC_IND8]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT9]] = add nuw nsw <2 x i64> [[VEC_IND8]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[IND_END]] ; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], 
{{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: @@ -410,19 +410,19 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP1]], align 1 ; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP3]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], splat (i8 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i8> [[STEP_ADD]], splat (i8 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[VEC_EPILOG_RESUME_VAL]] to i8 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] @@ -430,7 +430,7 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX2]] ; CHECK-NEXT: store <2 x i8> [[VEC_IND3]], ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <2 x i8> [[VEC_IND3]], splat (i8 2) +; CHECK-NEXT: [[VEC_IND_NEXT4]] = add nuw nsw <2 x i8> [[VEC_IND3]], splat (i8 2) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 10000 ; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index f50d0834c5dc8..75f256085a17a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -198,7 +198,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ] ; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ] -; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ] +; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT36:%.*]], %[[PRED_STORE_CONTINUE35]] ] ; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14) ; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] ; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x 
i8> [[VEC_IND1]], splat (i8 1) @@ -353,7 +353,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT: [[PRED_STORE_CONTINUE35]]: ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) -; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) +; DEFAULT-NEXT: [[VEC_IND_NEXT36]] = add nuw nsw <16 x i8> [[VEC_IND1]], splat (i8 16) ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -454,8 +454,8 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; DEFAULT-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() -; DEFAULT-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) -; DEFAULT-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; DEFAULT-NEXT: [[TMP11:%.*]] = mul nuw nsw [[TMP10]], splat (i8 1) +; DEFAULT-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP11]] ; DEFAULT-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 ; DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; DEFAULT-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -477,7 +477,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; DEFAULT-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; DEFAULT-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -504,8 +504,8 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; OPTSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() -; OPTSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) -; OPTSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; OPTSIZE-NEXT: [[TMP11:%.*]] = mul nuw nsw [[TMP10]], splat (i8 1) +; OPTSIZE-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP11]] ; OPTSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 ; OPTSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; OPTSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -527,7 +527,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; OPTSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true -; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw 
[[VEC_IND]], [[DOTSPLAT]] ; OPTSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -554,8 +554,8 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; MINSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; MINSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() -; MINSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) -; MINSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; MINSIZE-NEXT: [[TMP11:%.*]] = mul nuw nsw [[TMP10]], splat (i8 1) +; MINSIZE-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP11]] ; MINSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 ; MINSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; MINSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -577,7 +577,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; MINSIZE-NEXT: [[TMP24:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; MINSIZE-NEXT: [[TMP23:%.*]] = xor i1 [[TMP24]], true -; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; MINSIZE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; MINSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll index 5b61fba4ae994..5f8d7e7d24cc4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll @@ -19,8 +19,8 @@ define void @foo() { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP4]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -44,7 +44,7 @@ define void @foo() { ; CHECK-NEXT: [[TMP14:%.*]] = phi [ [[TMP10]], [[INNER_LOOP1]] ] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[TMP14]], align 4 [[TMP6]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll index 6d0777e42ab0e..2edbec4681ab0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -51,7 +51,7 @@ define void @foo_i32(i32 %n) { ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR_BODY31]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -113,7 +113,7 @@ define void @foo_i64(i64 %n) { ; CHECK-NEXT: br i1 [[TMP5]], label %[[VECTOR_LATCH]], label %[[FOR_BODY31]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll index 1dcd665817196..f67a3d9be408a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll @@ -320,7 +320,7 @@ define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; CHECK: [[PRED_STORE_CONTINUE12]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll index 2521ece2eea06..e338b828d2520 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -27,8 +27,8 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv2i32() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[IDX]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP8]], splat (i32 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP10]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw [[TMP8]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw [[DOTSPLAT]], [[TMP9]] ; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector 
[[DOTSPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -42,7 +42,7 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[OFFSET_IDX]] ; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP18]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT2]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT2]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll index 1213d974e75ef..f8be8d5b62031 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll @@ -43,7 +43,7 @@ define i32 @foo(i32 noundef %n, i32 noundef %lag, i32 noundef %shift) vscale_ran ; SC_SVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; SC_SVE-NEXT: [[TMP17]] = add <4 x i32> [[TMP16]], [[VEC_PHI]] ; SC_SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SC_SVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; SC_SVE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; SC_SVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SC_SVE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SC_SVE: middle.block: @@ -114,7 +114,7 @@ define i32 @foo(i32 noundef %n, i32 noundef %lag, i32 noundef %shift) vscale_ran ; NO_SC_SVE-NEXT: [[TMP16:%.*]] = shl <8 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; NO_SC_SVE-NEXT: [[TMP17]] = add <8 x i32> [[TMP16]], [[VEC_PHI]] ; NO_SC_SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; NO_SC_SVE-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; NO_SC_SVE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; NO_SC_SVE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO_SC_SVE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NO_SC_SVE: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index 3c0455938be80..337c097c85712 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -36,7 +36,7 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP11]], [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 28d2a278de498..e90f8d09fc7ab 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -106,7 +106,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw [[TMP2]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -128,7 +128,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -190,7 +190,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw [[TMP2]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -214,7 +214,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr @CD_i16, [[TMP9]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP14]], align 2 [[TMP15]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -291,7 +291,7 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] ; CHECK-NEXT: [[TMP16]] = sub [[TMP12]], [[TMP15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP17]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -391,7 +391,7 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -595,7 +595,7 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP4]], align 8 [[TMP5]], splat (i1 true)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP7]], align 8 [[TMP6]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: @@ -821,7 +821,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], align 4 [[TMP13]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: @@ -894,7 +894,7 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; CHECK-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: @@ -968,7 +968,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], align 4 [[TMP14]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: @@ -1044,7 +1044,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; CHECK-NEXT: [[TMP18]] = add [[TMP17]], 
[[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: @@ -1111,7 +1111,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl [[TMP10]], splat (i64 1) +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw [[TMP10]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -1125,7 +1125,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[P]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: @@ -1188,8 +1188,8 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add [[TMP21]], splat (i64 3) +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[TMP10]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw [[TMP19]], splat (i64 3) ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP7]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -1206,7 +1206,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT2]], align 4 [[TMP17]], splat (i1 true)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT4]], align 4 [[TMP15]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: @@ -1273,7 +1273,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNOT]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 ; 
CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl [[TMP14]], splat (i64 1) +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw [[TMP14]], splat (i64 1) ; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP9]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -1301,7 +1301,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store [[TMP27]], ptr [[TMP28]], align 4, !alias.scope [[META37:![0-9]+]], !noalias [[META34]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: middle.block: @@ -1394,7 +1394,7 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, [[TMP10]], i64 8 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP17]], align 4 [[TMP25]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK: middle.block: @@ -1590,7 +1590,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave4.nxv16i32( [[REVERSE6]], [[REVERSE7]], [[REVERSE8]], [[REVERSE9]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 02cc499f18827..82dff2f8aa2f7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -60,7 +60,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], [[INTERLEAVED_MASK3]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; 
SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -198,7 +198,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP10]], [[TMP7]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -328,7 +328,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP10]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP11]], [[TMP8]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT4]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -483,7 +483,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], [[INTERLEAVED_MASK3]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 945d808d3fa3f..8cc9e431e6214 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -164,9 +164,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]]) ; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul [[TMP13]], splat (i64 4) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 4, [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = mul nsw [[TMP13]], splat 
(i64 4) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = mul nsw i64 4, [[TMP4]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -182,7 +182,7 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index baf050c7facee..0617d2937f824 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -90,7 +90,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: [[TMP15:%.*]] = add [[WIDE_LOAD10]], [[VEC_IND]] ; NORMAL_DEP_LIMIT-NEXT: store [[TMP15]], ptr [[TMP14]], align 4, !alias.scope [[META7]], !noalias [[META0]] ; NORMAL_DEP_LIMIT-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP8]] -; NORMAL_DEP_LIMIT-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NORMAL_DEP_LIMIT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; NORMAL_DEP_LIMIT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NORMAL_DEP_LIMIT: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 871d9be609bd7..ca4faf4a0a1c9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -625,7 +625,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]] ; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll index 0b0e2d4154cb6..9f62c7dcda65a 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll @@ -351,8 +351,8 @@ define void @test_stride_noninvar_4i32(ptr readonly %data, ptr noalias nocapture ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 32) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add nuw nsw <4 x i32> [[VEC_IND2]], splat (i32 32) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: @@ -452,9 +452,9 @@ define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocaptur ; CHECK-NEXT: [[IND_END:%.*]] = add i32 3, [[TMP0]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> , [[DOTSPLAT]] -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> splat (i32 3), [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[X]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw <4 x i32> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> splat (i32 3), [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i32 [[X]], 4 ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -470,8 +470,8 @@ define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocaptur ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add nuw nsw <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll index 4af40b711726d..9ea95658818fe 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -508,7 +508,7 @@ define dso_local void @select_not_allowed(ptr noalias nocapture %A, ptr noalias ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index 2087218bf3ea3..adf443b74acf1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -14,21 +14,21 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul [[TMP11]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw [[TMP11]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 3, [[TMP17]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -69,21 +69,21 @@ define void @block_with_dead_inst_2(ptr %src) #0 { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP5]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw [[TMP5]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw i64 3, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) ; CHECK-NEXT: 
[[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -124,21 +124,21 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP5]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw [[TMP5]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw i64 3, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -186,21 +186,21 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul [[TMP11]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw [[TMP11]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 3, [[TMP17]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( 
zeroinitializer, align 2 [[TMP20]], splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -248,21 +248,21 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 { ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP5]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nsw [[TMP5]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 333, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 3, [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw i64 3, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -319,15 +319,15 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = xor [[BROADCAST_SPLAT]], splat (i1 true) ; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul [[TMP11]], splat (i64 3) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw [[TMP11]], splat (i64 3) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP5]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP27]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 3, [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 3, [[TMP12]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 ; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] @@ -340,7 +340,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[TMP24:%.*]] = or [[TMP22]], [[TMP23]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], [[TMP24]], i32 [[TMP27]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index b81637f50989d..9f6f79d9030ed 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -354,8 +354,8 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP11:%.*]] = mul [[TMP9]], splat (i64 2) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw [[TMP9]], splat (i64 2) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw zeroinitializer, [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -363,7 +363,7 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP16]] +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw i64 2, [[TMP16]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 2 @@ -378,7 +378,7 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP18]], align 4 [[TMP19]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index 21272cb72f4d6..69d83db49fd18 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -11,8 +11,8 @@ define void 
@test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-NEXT: br label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP9]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index 5c78cfd6daded..b62d6da2f2e2d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -44,7 +44,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) ; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_EPILOGUE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: @@ -222,7 +222,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) ; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] -; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_EPILOGUE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll index f36919f98dd00..b436a46842eab 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll @@ -22,7 +22,7 @@ define void @store_factor_2_with_tail_gap(i64 %n, ptr %a) { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> ; CHECK-NEXT: call void @llvm.masked.store.v32i64.p0(<32 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP1]], <32 x i1> ) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br 
i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index 06b47aa6551a0..77ee3c08329d5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -16,8 +16,8 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; VLENUNK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) -; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul [[TMP6]], splat (i64 1) +; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 1cbec47d72203..f49f9284a1e93 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -46,15 +46,15 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() -; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i64 16) -; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; RV32-NEXT: [[TMP9:%.*]] = mul nuw nsw [[TMP7]], splat (i64 16) +; RV32-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP9]] ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; RV32-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 -; RV32-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]] +; RV32-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP8]] ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] @@ -68,7 +68,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] -; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV32-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP7:![0-9]+]]
 ; RV32: middle.block:
@@ -115,15 +115,15 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; RV64: vector.ph:
 ; RV64-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; RV64-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16)
-; RV64-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP9]]
+; RV64-NEXT: [[TMP1:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 16)
+; RV64-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP1]]
 ; RV64-NEXT: br label [[VECTOR_BODY:%.*]]
 ; RV64: vector.body:
 ; RV64-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV64-NEXT: [[AVL:%.*]] = phi i64 [ 625, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; RV64-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
 ; RV64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64
-; RV64-NEXT: [[TMP11:%.*]] = mul i64 16, [[TMP8]]
+; RV64-NEXT: [[TMP11:%.*]] = mul nuw nsw i64 16, [[TMP8]]
 ; RV64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]]
@@ -137,7 +137,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]]
 ; RV64-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
 ; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
-; RV64-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; RV64-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; RV64-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; RV64-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; RV64: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
new file mode 100644
index 0000000000000..fa710cb8d65b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @ptr_induction(ptr %p, ptr noalias %q, ptr noalias %p.end) #0 {
+; CHECK-LABEL: define void @ptr_induction(
+; CHECK-SAME: ptr [[P:%.*]], ptr noalias [[Q:%.*]], ptr noalias [[P_END:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P2:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[P_END1:%.*]] = ptrtoint ptr [[P_END]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_END1]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P2]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[Q]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[P]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul <vscale x 2 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP3]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], %[[VECTOR_PH]] ], [ [[PTR_IND7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint <vscale x 2 x ptr> [[VECTOR_GEP]] to <vscale x 2 x i64>
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP4]])
+; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[PTR_IND7]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %ptr.ind = phi ptr [ %p, %entry ], [ %ptr.ind.next, %loop ]
+  %ptri64 = ptrtoint ptr %ptr.ind to i64
+  store i64 %ptri64, ptr %q
+  store i64 %iv, ptr %p
+  %iv.next = add i64 %iv, 1
+  %ptr.ind.next = getelementptr i8, ptr %ptr.ind, i64 1
+  %ec = icmp eq ptr %ptr.ind, %p.end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+v" }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index 8e562a97d51cf..02c363bb54457 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -205,7 +205,7 @@ define void @safe_load_store_distance_not_pow_of_2(i64 %N) {
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [10 x [12 x i16]], ptr @a, i64 0, i64 8, <8 x i64> [[VEC_IND]]
 ; CHECK-NEXT: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> zeroinitializer, <8 x ptr> align 2 [[TMP7]], <8 x i1> splat (i1 true))
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 24)
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 723b5e9cc280d..414e5d9295554 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -115,15 +115,15 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 64)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP6]], splat (i64 64)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 64, [[TMP11]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 64, [[TMP11]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]]
@@ -131,7 +131,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1)
 ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
@@ -156,8 +156,8 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-UF2-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 64
 ; CHECK-UF2-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[BROADCAST_SPLAT]], splat (i64 64)
 ; CHECK-UF2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-UF2-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], splat (i64 64)
-; CHECK-UF2-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-UF2-NEXT: [[TMP8:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP7]], splat (i64 64)
+; CHECK-UF2-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-UF2-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UF2: vector.body:
 ; CHECK-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -172,7 +172,7 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true))
 ; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true))
 ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
-; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[TMP6]]
+; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[STEP_ADD]], [[TMP6]]
 ; CHECK-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-UF2: middle.block:
@@ -1339,8 +1339,8 @@
define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] ; NOSTRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; NOSTRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) -; NOSTRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; NOSTRIDED-UF2-NEXT: [[TMP6:%.*]] = mul nuw nsw [[TMP5]], splat (i64 1) +; NOSTRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP6]] ; NOSTRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]] ; NOSTRIDED-UF2: vector.body: ; NOSTRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1357,7 +1357,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 ; NOSTRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 ; NOSTRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] +; NOSTRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] ; NOSTRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NOSTRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; NOSTRIDED-UF2: middle.block: @@ -1423,8 +1423,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]] ; STRIDED-UF2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; STRIDED-UF2-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; STRIDED-UF2-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) -; STRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; STRIDED-UF2-NEXT: [[TMP6:%.*]] = mul nuw nsw [[TMP5]], splat (i64 1) +; STRIDED-UF2-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP6]] ; STRIDED-UF2-NEXT: br label [[VECTOR_BODY:%.*]] ; STRIDED-UF2: vector.body: ; STRIDED-UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1441,7 +1441,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP9]], align 8 ; STRIDED-UF2-NEXT: store [[WIDE_MASKED_GATHER1]], ptr [[TMP12]], align 8 ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] -; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[BROADCAST_SPLAT]] +; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[STEP_ADD]], [[BROADCAST_SPLAT]] ; STRIDED-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; STRIDED-UF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; STRIDED-UF2: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll index 8ab0f6f4c14f1..317bcde2f4670 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll @@ -1184,8 +1184,8 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: br label %[[VECTOR_PH:.*]] ; IF-EVL: [[VECTOR_PH]]: ; IF-EVL-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv2i64() -; IF-EVL-NEXT: [[TMP10:%.*]] = mul [[TMP9]], splat 
(i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-NEXT: [[TMP1:%.*]] = mul [[TMP9]], splat (i64 1) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1222,8 +1222,8 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP6]], splat (i64 1) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1235,7 +1235,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; NO-VP-NEXT: store [[TMP10]], ptr [[TMP11]], align 8 ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]] ; NO-VP: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll index 34a82757eccc0..c7003560721bc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll @@ -380,8 +380,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP: vector.ph: ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i32 1) -; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul nuw nsw [[TMP10]], splat (i32 1) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP2]] ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ] @@ -400,7 +400,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP20]], [[EVL_BASED_IV1]] ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; 
IF-EVL-OUTLOOP: middle.block: @@ -415,8 +415,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL-INLOOP: vector.ph: ; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], splat (i32 1) -; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = mul nuw nsw [[TMP9]], splat (i32 1) +; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP1]] ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-INLOOP: vector.body: ; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT1:%.*]], [[VECTOR_BODY]] ] @@ -435,7 +435,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT1]] = add i64 [[TMP19]], [[EVL_BASED_IV1]] ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] -; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-INLOOP: middle.block: @@ -457,8 +457,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i32 1) -; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] +; NO-VP-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw nsw [[TMP12]], splat (i32 1) +; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP8]] ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP7]] to i32 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -473,7 +473,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = select [[TMP27]], [[WIDE_MASKED_LOAD]], zeroinitializer ; NO-VP-OUTLOOP-NEXT: [[TMP23]] = add [[TMP22]], [[VEC_PHI]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-OUTLOOP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; NO-VP-OUTLOOP: middle.block: @@ -513,8 +513,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i32 1) -; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP6]], splat (i32 1) +; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = 
trunc i64 [[TMP3]] to i32 ; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 ; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -530,7 +530,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[VEC_PHI]], [[TMP17]] ; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; NO-VP-INLOOP: middle.block: @@ -584,8 +584,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP: vector.ph: ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul [[TMP10]], splat (i32 1) -; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[TMP2:%.*]] = mul nuw nsw [[TMP10]], splat (i32 1) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add nuw nsw zeroinitializer, [[TMP2]] ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -604,7 +604,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add [[VEC_IND2]], [[BROADCAST_SPLAT2]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add nuw nsw [[VEC_IND2]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: @@ -619,8 +619,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL-INLOOP: vector.ph: ; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul [[TMP9]], splat (i32 1) -; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] +; IF-EVL-INLOOP-NEXT: [[TMP1:%.*]] = mul nuw nsw [[TMP9]], splat (i32 1) +; IF-EVL-INLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP1]] ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-INLOOP: vector.body: ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -638,7 +638,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[IV]] ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] -; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = icmp eq i64 
[[AVL_NEXT]], 0 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-INLOOP: middle.block: @@ -660,8 +660,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 ; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i32 1) -; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] +; NO-VP-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw nsw [[TMP12]], splat (i32 1) +; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP8]] ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP7]] to i32 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -676,7 +676,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select [[TMP28]], [[TMP22]], [[VEC_PHI]] ; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP-OUTLOOP: middle.block: @@ -720,8 +720,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; NO-VP-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; NO-VP-INLOOP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i32 1) -; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-INLOOP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP6]], splat (i32 1) +; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP3]] to i32 ; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 ; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -737,7 +737,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; NO-VP-INLOOP-NEXT: [[TMP18]] = add i32 [[VEC_PHI]], [[TMP17]] ; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; NO-VP-INLOOP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; NO-VP-INLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP-INLOOP: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll index c7ba826295de8..a3bec999425a3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -545,8 +545,8 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP18]], 2 ; IF-EVL-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 ; IF-EVL-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; IF-EVL-NEXT: [[TMP12:%.*]] = mul [[TMP6]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; IF-EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i32 [[TMP13]], 2 ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i32 [[TMP19]], 1 @@ -589,8 +589,8 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP6]], splat (i64 1) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() @@ -607,7 +607,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[INDEX]] ; NO-VP-NEXT: store [[TMP13]], ptr [[TMP11]], align 8 ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; NO-VP: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index a07e031418762..b5662b0bd8d3b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -119,8 +119,8 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] +; IF-EVL-NEXT: [[TMP1:%.*]] = mul [[TMP2]], splat (i64 1) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -163,8 +163,8 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]] ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add 
zeroinitializer, [[TMP7]] +; NO-VP-NEXT: [[TMP7:%.*]] = mul nuw nsw [[TMP6]], splat (i64 1) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP7]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -181,7 +181,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), poison) ; NO-VP-NEXT: [[TMP13]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; NO-VP: middle.block: @@ -249,8 +249,8 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i32 1) -; IF-EVL-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP5]] +; IF-EVL-NEXT: [[TMP1:%.*]] = mul [[TMP4]], splat (i32 1) +; IF-EVL-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP1]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] @@ -285,8 +285,8 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP9]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) -; NO-VP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP7]], splat (i32 1) +; NO-VP-NEXT: [[INDUCTION1:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -300,7 +300,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND4]], i32 3 ; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[VEC_IND4]], align 4 [[TMP13]], splat (i1 true)) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP9]] -; NO-VP-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND4]], [[BROADCAST_SPLAT3]] +; NO-VP-NEXT: [[VEC_IND_NEXT5]] = add nuw nsw [[VEC_IND4]], [[BROADCAST_SPLAT3]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; NO-VP: middle.block: @@ -359,8 +359,8 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP3]] +; 
IF-EVL-NEXT: [[TMP1:%.*]] = mul [[TMP2]], splat (i64 1) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -403,8 +403,8 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]] ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; NO-VP-NEXT: [[TMP7:%.*]] = mul nuw nsw [[TMP6]], splat (i64 1) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add nuw nsw zeroinitializer, [[TMP7]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -421,7 +421,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), poison) ; NO-VP-NEXT: [[TMP13]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NO-VP: middle.block: @@ -490,8 +490,8 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: br label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i32 1) -; IF-EVL-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP5]] +; IF-EVL-NEXT: [[TMP1:%.*]] = mul [[TMP4]], splat (i32 1) +; IF-EVL-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP1]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] @@ -526,8 +526,8 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP9]] ; NO-VP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() -; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) -; NO-VP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP8]] +; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw nsw [[TMP7]], splat (i32 1) +; NO-VP-NEXT: [[INDUCTION1:%.*]] = add nuw nsw zeroinitializer, [[TMP5]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -541,7 +541,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], [[VEC_IND4]], i32 2 ; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[VEC_IND4]], align 4 [[TMP13]], splat (i1 true)) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP9]] -; 
NO-VP-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND4]], [[BROADCAST_SPLAT3]] +; NO-VP-NEXT: [[VEC_IND_NEXT5]] = add nuw nsw [[VEC_IND4]], [[BROADCAST_SPLAT3]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; NO-VP: middle.block: @@ -594,8 +594,8 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 -1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT]], [[TMP5]] +; IF-EVL-NEXT: [[TMP3:%.*]] = mul nsw [[TMP4]], splat (i64 -1) +; IF-EVL-NEXT: [[INDUCTION:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP3]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -603,7 +603,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP1]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 -1, [[TMP7]] +; IF-EVL-NEXT: [[TMP8:%.*]] = mul nsw i64 -1, [[TMP7]] ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 @@ -619,7 +619,7 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[TMP15:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER5]] ; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP15]], [[VEC_PHI]], i32 [[TMP6]]) ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: @@ -646,9 +646,9 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i64() ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[N]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i64 -1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT]], [[TMP8]] -; NO-VP-NEXT: [[TMP9:%.*]] = mul i64 -1, [[TMP5]] +; NO-VP-NEXT: [[TMP8:%.*]] = mul nsw [[TMP7]], splat (i64 -1) +; NO-VP-NEXT: [[INDUCTION:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP8]] +; NO-VP-NEXT: [[TMP9:%.*]] = mul nsw i64 -1, [[TMP5]] ; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -668,7 +668,7 @@ define i32 @load_factor_4_reverse(i64 
%n, ptr noalias %a) { ; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP15]], splat (i1 true), poison) ; NO-VP-NEXT: [[TMP16]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; NO-VP-NEXT: [[VEC_IND_NEXT]] = add nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; NO-VP: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 1e21c753840e9..0375f0a8fd132 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -249,8 +249,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() -; SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP6]], splat (i64 1) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -298,7 +298,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4) ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXEDLEN: [[MIDDLE_BLOCK]]: @@ -330,8 +330,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP5]], splat (i64 1) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -590,8 +590,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, 
zeroinitializer ; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; SCALABLE-NEXT: [[TMP13:%.*]] = mul [[TMP6]], splat (i64 1) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP13]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP6]], splat (i64 1) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: ; SCALABLE-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -661,8 +661,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP5]], splat (i64 1) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP5]], splat (i64 1) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; TF-SCALABLE: [[VECTOR_BODY]]: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -712,8 +712,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; SCALABLE-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP6]], splat (i64 1) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; SCALABLE: [[VECTOR_BODY]]: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -760,7 +760,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4) ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXEDLEN: [[MIDDLE_BLOCK]]: @@ -793,8 +793,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP5]], splat (i64 1) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul [[TMP5]], splat (i64 1) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] ; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]] ; 
TF-SCALABLE: [[VECTOR_BODY]]: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index bb85b88f181f7..8536b3f0703cd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -11,7 +11,7 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 2: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 2: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 2: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> +; CHECK: Cost of 0 for VF 2: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 2: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> ; CHECK: Cost of 0 for VF 2: vp<%5> = vector-pointer ir<%g.src> @@ -29,7 +29,7 @@ define void @wide_or_replaced_with_add_vpinstruction(ptr %src, ptr noalias %dst) ; CHECK: Cost of 0 for VF 4: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] ; CHECK: Cost of 1 for VF 4: exit condition instruction %exitcond = icmp eq i64 %iv.next, 32 ; CHECK: Cost of 0 for VF 4: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> +; CHECK: Cost of 0 for VF 4: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<%0> ; CHECK: Cost of 0 for VF 4: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<%4> ; CHECK: Cost of 0 for VF 4: vp<%5> = vector-pointer ir<%g.src> diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 193b5d4a788dc..69505eb176f50 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -48,7 +48,7 @@ define void @PR31671(float %x, ptr %d) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP1]] ; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP4]], <16 x ptr> align 4 [[TMP2]], <16 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 80) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 80) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6384 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index c5f581fad41f5..3165422dcc539 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -34,7 +34,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[TMP11]] = or <4 
x i32> [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index ed288d2f99a0b..fd76aa779bfb0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -31,7 +31,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[IV_START]] to i32 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <16 x i32> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -46,7 +46,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[ARR:%.*]], i64 [[TMP16]] ; CHECK-NEXT: store <16 x i16> [[TMP15]], ptr [[TMP17]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i32> [[VEC_IND]], splat (i32 16) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -55,7 +55,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -65,7 +65,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 ; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i64 0 ; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION11:%.*]] = add <4 x i32> [[DOTSPLAT10]], +; CHECK-NEXT: [[INDUCTION11:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT10]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; 
CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] @@ -80,9 +80,9 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[ARR]], i64 [[TMP27]] ; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP28]], align 2 ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX8]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i32> [[VEC_IND12]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT13]] = add nuw nsw <4 x i32> [[VEC_IND12]], splat (i32 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC4]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -101,7 +101,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: store i16 [[STORE_VAL]], ptr [[ADDR]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp sgt i32 [[IV_TRUNC]], 91 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -179,7 +179,7 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END10:%.*]] = mul i16 [[DOTCAST9]], [[TMP0]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -208,7 +208,7 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX12]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT21]] = add <8 x i16> [[VEC_IND20]], [[BROADCAST_SPLAT23]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[L]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N25]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -224,7 +224,7 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX3]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[L]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], 
label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -265,12 +265,12 @@ define i8 @multiple_inductions_start_at_0() { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 128 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[STEP_ADD_3]], zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <32 x i8> [[STEP_ADD_3]], i32 31 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -285,7 +285,7 @@ define i8 @multiple_inductions_start_at_0() { ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i32 [[INDEX1]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <4 x i8> [[VEC_IND2]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1052 -; CHECK-NEXT: br i1 [[TMP3]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[VEC_IND2]], i32 3 ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -300,7 +300,7 @@ define i8 @multiple_inductions_start_at_0() { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[IV_2]] to i8 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ugt i32 [[IV]], 1050 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RES:%.*]] = phi i8 [ [[TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[RES]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index 002d811d46992..12b8d1e15b523 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -222,7 +222,7 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 ; CHECK-NEXT: store i64 [[TMP2]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 36 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index db592f959bace..0bac10a41640e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -134,7 +134,7 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX512: middle.block: @@ -180,7 +180,7 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]] ; FVW2: pred.store.continue3: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32) +; FVW2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 32) ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FVW2: middle.block: @@ -249,7 +249,7 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) { ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX512: middle.block: @@ -295,7 +295,7 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) { ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE2]] ; FVW2: pred.store.continue2: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32) +; FVW2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 32) ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FVW2: middle.block: @@ -351,7 +351,7 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 
256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX512: middle.block: @@ -397,7 +397,7 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]] ; FVW2: pred.store.continue3: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32) +; FVW2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 32) ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; FVW2: middle.block: @@ -452,7 +452,7 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: middle.block: @@ -498,7 +498,7 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]] ; FVW2: pred.store.continue3: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32) +; FVW2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 32) ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FVW2: middle.block: @@ -553,7 +553,7 @@ define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noal ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AVX512: middle.block: @@ -599,7 +599,7 @@ define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noal ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]] ; FVW2: pred.store.continue3: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32) +; FVW2-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 32) ; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; FVW2: middle.block: diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 9240484c6998b..5eeebf2009a62 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -370,7 +370,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[TMP63]], <32 x i32> poison, <32 x i32> ; CHECK-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], ptr [[TMP56]], align 4, !alias.scope [[META13]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 32) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 32) ; CHECK-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll index 0678e9eea4c35..b6d26b4ab46be 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll @@ -35,8 +35,8 @@ define void @avoid_sinking_store_across_load(ptr %arr) { ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC5]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 12) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add nuw nsw <4 x i64> [[VEC_IND2]], splat (i64 12) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 6558f761142f0..932153a23bdbd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1021,7 +1021,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> align 8 [[TMP6]], <8 x i1> [[TMP1]]), !alias.scope [[META29:![0-9]+]], !noalias [[META31:![0-9]+]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 128) +; AVX512-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i64> [[VEC_IND]], splat (i64 128) ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 ; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; AVX512: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index 
5e1850be132bd..a8c5bb0ee6f3b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -173,7 +173,7 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], splat (i32 64) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <64 x i32> [[VEC_IND]], splat (i32 64) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -198,7 +198,7 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4 ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; AUTOVF-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTOVF: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll index 113bb7a7f2aca..4eaadcb93e3de 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -50,7 +50,7 @@ define void @foo(i32 %n) { ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR_BODY31]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll index ff5020cd60138..1350e40c77e66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll @@ -29,7 +29,7 @@ define void @foo(ptr %ptr, ptr %ptr.2) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP6]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll 
b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll index e7f56a45ebdc6..34c54de2140cc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -38,7 +38,7 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll index e888ad3b8eb4e..fb5a5f9c068b9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll @@ -36,7 +36,7 @@ define void @test_pr55096(i64 %c, ptr %p) { ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 340 ; CHECK-NEXT: br i1 [[TMP15]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index ba7db65d745a9..08855fe9ecba5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -35,7 +35,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 8 [[TMP8]], <4 x i1> [[REVERSE]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index e405fe7c6f764..9ef4e205a970d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -59,8 +59,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP11]], <16 x i64> [[TMP15]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> align 8 [[TMP16]], <16 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 32) -; 
CHECK-NEXT: [[VEC_IND_NEXT3]] = add <16 x i64> [[VEC_IND3]], splat (i64 32) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i64> [[VEC_IND]], splat (i64 32) +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add nuw nsw <16 x i64> [[VEC_IND3]], splat (i64 32) ; CHECK-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP63]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -83,15 +83,15 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[IND_END11:%.*]] = mul i64 [[N_VEC7]], 2 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <8 x i64> [[DOTSPLAT]], ; CHECK-NEXT: [[DOTSPLATINSERT17:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT18:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT17]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION19:%.*]] = add <8 x i64> [[DOTSPLAT18]], +; CHECK-NEXT: [[INDUCTION9:%.*]] = add nuw nsw <8 x i64> [[DOTSPLAT18]], ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <8 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND20:%.*]] = phi <8 x i64> [ [[INDUCTION19]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT21:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <8 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND20:%.*]] = phi <8 x i64> [ [[INDUCTION9]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP18:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND15]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND15]] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw <8 x i64> [[TMP18]], [[VEC_IND20]] @@ -102,8 +102,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP19]], <8 x i64> [[TMP23]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> align 8 [[TMP24]], <8 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX14]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <8 x i64> [[VEC_IND15]], splat (i64 16) -; CHECK-NEXT: [[VEC_IND_NEXT21]] = add <8 x i64> [[VEC_IND20]], splat (i64 16) +; CHECK-NEXT: [[VEC_IND_NEXT14]] = add nuw nsw <8 x i64> [[VEC_IND15]], splat (i64 16) +; CHECK-NEXT: [[VEC_IND_NEXT15]] = add nuw nsw <8 x i64> [[VEC_IND20]], splat (i64 16) ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[TMP25]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: @@ -134,8 +134,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: br label %[[VECTOR_BODY30:.*]] ; CHECK: [[VECTOR_BODY30]]: ; CHECK-NEXT: [[INDEX34:%.*]] = phi i64 [ 0, %[[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], %[[VECTOR_BODY30]] ] -; CHECK-NEXT: 
[[VEC_IND35:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], %[[VECTOR_BODY30]] ] -; CHECK-NEXT: [[VEC_IND37:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], %[[VECTOR_BODY30]] ] +; CHECK-NEXT: [[VEC_IND35:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT35:%.*]], %[[VECTOR_BODY30]] ] +; CHECK-NEXT: [[VEC_IND37:%.*]] = phi <16 x i64> [ , %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], %[[VECTOR_BODY30]] ] ; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]] ; CHECK-NEXT: [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]] @@ -148,8 +148,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> align 16 [[TMP33]], <16 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> align 8 [[TMP37]], <16 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX34]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32) -; CHECK-NEXT: [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32) +; CHECK-NEXT: [[VEC_IND_NEXT35]] = add nuw nsw <16 x i64> [[VEC_IND35]], splat (i64 32) +; CHECK-NEXT: [[VEC_IND_NEXT36]] = add nuw nsw <16 x i64> [[VEC_IND37]], splat (i64 32) ; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]] ; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK37:.*]], label %[[VECTOR_BODY30]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK37]]: @@ -175,15 +175,15 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true) ; CHECK-NEXT: [[DOTSPLATINSERT62:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL42]], i64 0 ; CHECK-NEXT: [[DOTSPLAT63:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT62]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION64:%.*]] = add <8 x i64> [[DOTSPLAT63]], +; CHECK-NEXT: [[INDUCTION52:%.*]] = add nuw nsw <8 x i64> [[DOTSPLAT63]], ; CHECK-NEXT: [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0 ; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], +; CHECK-NEXT: [[INDUCTION55:%.*]] = add nuw nsw <8 x i64> [[DOTSPLAT68]], ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY56:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY56]]: ; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL39]], %[[VEC_EPILOG_PH45]] ], [ [[INDEX_NEXT74:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] -; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT66:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] -; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT71:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] +; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION52]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT61:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] +; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION55]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT62:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] ; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]] ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x 
i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]] ; CHECK-NEXT: [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]] @@ -196,8 +196,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> align 16 [[TMP47]], <8 x i1> [[BROADCAST_SPLAT73]]) ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> align 8 [[TMP51]], <8 x i1> [[BROADCAST_SPLAT73]]) ; CHECK-NEXT: [[INDEX_NEXT74]] = add nuw i64 [[INDEX61]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16) -; CHECK-NEXT: [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16) +; CHECK-NEXT: [[VEC_IND_NEXT61]] = add nuw nsw <8 x i64> [[VEC_IND65]], splat (i64 16) +; CHECK-NEXT: [[VEC_IND_NEXT62]] = add nuw nsw <8 x i64> [[VEC_IND70]], splat (i64 16) ; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]] ; CHECK-NEXT: br i1 [[TMP55]], label %[[VEC_EPILOG_MIDDLE_BLOCK63:.*]], label %[[VEC_EPILOG_VECTOR_BODY56]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK63]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll index eca70b3af159c..86ac78be5bda9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll @@ -29,7 +29,7 @@ define void @load_store_interleave_group_with_gaps(ptr noalias %data, i64 nounde ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DATA]], <4 x i64> [[TMP5]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC2]], <4 x ptr> align 8 [[TMP6]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll index efc9a4fa57292..8184cad22ae8b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll @@ -44,7 +44,7 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP14]], ptr align 4 [[TMP34]], <4 x i1> [[TMP18]]) ; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP15]], ptr align 4 [[TMP35]], <4 x i1> [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll 
b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index f9570405ecabc..b5d42a8f71430 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -39,7 +39,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; DISABLED_MASKED_STRIDED: vector.body: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -124,7 +124,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr align 1 [[TMP50]], <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -139,7 +139,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> @@ -148,7 +148,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP3]], <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 
[[INDEX_NEXT]], 1016 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.body: @@ -212,7 +212,7 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; DISABLED_MASKED_STRIDED: vector.body: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -297,7 +297,7 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr align 1 [[TMP50]], <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -312,7 +312,7 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> @@ -322,7 +322,7 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP4]], <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP4:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: @@ -903,7 +903,7 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read ; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[TMP32]], ptr [[TMP33]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -1162,7 +1162,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no ; DISABLED_MASKED_STRIDED: vector.body: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -1472,7 +1472,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] ; DISABLED_MASKED_STRIDED: pred.store.continue60: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP165]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -1487,7 +1487,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp samesign ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> @@ -1500,7 +1500,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> 
[[TMP4]], <8 x i8> [[TMP5]], <16 x i32> ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP6]], <16 x i1> [[INTERLEAVED_MASK]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: @@ -1877,7 +1877,7 @@ define dso_local void @masked_strided2_reverse(ptr noalias nocapture readonly %p ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] ; DISABLED_MASKED_STRIDED: pred.store.continue60: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 -8) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 -8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP165]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -2202,7 +2202,7 @@ define dso_local void @masked_strided2_reverse(ptr noalias nocapture readonly %p ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] ; ENABLED_MASKED_STRIDED: pred.store.continue60: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 -8) +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND]], splat (i32 -8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP165]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll index 500e60372ac13..cfc588a7f5d89 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -64,7 +64,7 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], ptr [[TMP21]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -352,7 +352,7 @@ define dso_local void @test(ptr noalias nocapture %points, ptr noalias nocapture ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] ; DISABLED_MASKED_STRIDED: pred.store.continue6: ; 
DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP19]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: @@ -405,7 +405,7 @@ define dso_local void @test(ptr noalias nocapture %points, ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] ; ENABLED_MASKED_STRIDED: pred.store.continue6: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP19]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index 67fe87a328976..5999a3581e467 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -339,7 +339,7 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) { ; SINK-GATHER-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP1]], <8 x i32> [[TMP64]], <8 x i32> [[BROADCAST_SPLAT16]] ; SINK-GATHER-NEXT: [[TMP66]] = add <8 x i32> [[VEC_PHI]], [[PREDPHI]] ; SINK-GATHER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SINK-GATHER-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) +; SINK-GATHER-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i64> [[VEC_IND]], splat (i64 8) ; SINK-GATHER-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SINK-GATHER-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SINK-GATHER: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index b41ddebc0c362..a9a0b33f542af 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -161,7 +161,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll index b59ad8481597c..af72f3641a635 100644 --- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll @@ -17,7 +17,7 @@ define void @_Z3foov() { ; CHECK: vector.ph: ; CHECK: br label [[VECTOR_BODY:%.*]] ; CHECK: 
vector.body: -; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: br i1 [[TMP4:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: ; CHECK: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: @@ -28,7 +28,7 @@ define void @_Z3foov() { ; CHECK-MASKED: vector.ph: ; CHECK-MASKED: br label [[VECTOR_BODY:%.*]] ; CHECK-MASKED: vector.body: -; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-MASKED: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK-MASKED: middle.block: ; CHECK-MASKED: br label [[FOR_BODY:%.*]] ; CHECK-MASKED: for.cond.cleanup: @@ -41,15 +41,15 @@ define void @_Z3foov() { ; CHECK-SCALABLE: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]] ; CHECK-SCALABLE: vector.body: -; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add [[VEC_IND:%.*]], [[BROADCAST_SPLAT]] -; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add nuw nsw [[VEC_IND:%.*]], [[BROADCAST_SPLAT]] +; CHECK-SCALABLE: br i1 [[TMP11:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-SCALABLE: middle.block: -; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF6:![0-9]+]] ; CHECK-SCALABLE: scalar.ph: ; CHECK-SCALABLE: br label [[FOR_BODY:%.*]] ; CHECK-SCALABLE: for.cond.cleanup: ; CHECK-SCALABLE: for.body: -; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] ; entry: br label %for.body @@ -80,14 +80,14 @@ define void @_Z3foo2v() { ; CHECK: vector.ph: ; CHECK: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: br i1 [[TMP4:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK: br label [[SCALAR_PH:%.+]] +; CHECK: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; CHECK: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK: for.body: -; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; ; CHECK-MASKED-LABEL: @_Z3foo2v( ; CHECK-MASKED: entry: @@ -95,14 +95,14 @@ define void @_Z3foo2v() { ; CHECK-MASKED: vector.ph: ; CHECK-MASKED: br label [[VECTOR_BODY:%.*]] ; CHECK-MASKED: vector.body: -; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MASKED: br i1 
[[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-MASKED: middle.block: -; CHECK-MASKED: br label [[SCALAR_PH:%.+]] +; CHECK-MASKED: br label [[SCALAR_PH:%.*]] ; CHECK-MASKED: scalar.ph: ; CHECK-MASKED: br label [[FOR_BODY:%.*]] ; CHECK-MASKED: for.cond.cleanup: ; CHECK-MASKED: for.body: -; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; ; CHECK-SCALABLE-LABEL: @_Z3foo2v( ; CHECK-SCALABLE: entry: @@ -112,15 +112,15 @@ define void @_Z3foo2v() { ; CHECK-SCALABLE: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]] ; CHECK-SCALABLE: vector.body: -; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add [[VEC_IND:%.*]], [[BROADCAST_SPLAT]] -; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add nuw nsw [[VEC_IND:%.*]], [[BROADCAST_SPLAT]] +; CHECK-SCALABLE: br i1 [[TMP11:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SCALABLE: middle.block: -; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]] +; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF6]] ; CHECK-SCALABLE: scalar.ph: ; CHECK-SCALABLE: br label [[FOR_BODY:%.*]] ; CHECK-SCALABLE: for.cond.cleanup: ; CHECK-SCALABLE: for.body: -; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-SCALABLE: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF11:![0-9]+]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 45405511abe90..1fe3962dfd072 100644 --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -1612,7 +1612,7 @@ define void @pr61396_pointer_used_as_both_stored_value_and_pointer_operand_by_st ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 0 ; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240 ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -1642,7 +1642,7 @@ define void @pr61396_pointer_used_as_both_stored_value_and_pointer_operand_by_st ; INTER-NEXT: [[TMP1:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 0 ; INTER-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP1]], align 8 ; INTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; INTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; INTER-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; INTER-NEXT: [[TMP2:%.*]] 
= icmp eq i64 [[INDEX_NEXT]], 10240 ; INTER-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; INTER: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll b/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll index 49eb8b349a274..5d92c127aff93 100644 --- a/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll +++ b/llvm/test/Transforms/LoopVectorize/cse-gep-source-element-type.ll @@ -87,7 +87,7 @@ define void @cse_wide_gep(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %n ; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[TMP6]], align 8 ; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 5177d7b6e0090..1773b2aab7d87 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -43,12 +43,13 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop.header @@ -102,8 +103,9 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] @@ -172,12 +174,13 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], 
label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -247,12 +250,13 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -322,12 +326,13 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -397,12 +402,13 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -468,12 +474,13 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -543,12 +550,13 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop.header
@@ -621,7 +629,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
 ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -679,8 +687,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -750,8 +759,9 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ]
@@ -803,8 +813,9 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -857,8 +868,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -928,8 +940,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -999,8 +1012,9 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ]
@@ -1068,8 +1082,9 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
@@ -1138,8 +1153,9 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 %a = call ptr @get_ptr()
@@ -1195,8 +1211,9 @@ define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(p
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -1267,8 +1284,9 @@ define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -1339,8 +1357,9 @@ define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_b
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -1418,8 +1437,9 @@ define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predec
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br [[EXIT:label %.*]]
-; CHECK: [[SCALAR_PH:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
 ;
 entry:
 call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 5e3a70222d7bb..6c9e3694b2a34 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -23,7 +23,7 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
 ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -46,18 +46,18 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD8]], splat (i64 3)
 ; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND5]], <4 x i64> [[VEC_PHI7]]
 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT11]] = add nuw nsw <4 x i64> [[VEC_IND5]], splat (i64 4)
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -124,7 +124,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
 ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -147,18 +147,18 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT8]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[DOTSPLAT9]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD8]], splat (float 3.000000e+00)
 ; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND5]], <4 x i64> [[VEC_PHI7]]
 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT11]] = add nuw nsw <4 x i64> [[VEC_IND5]], splat (i64 4)
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -231,7 +231,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
 ; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP7]], <4 x i8> [[VEC_IND]], <4 x i8> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -257,11 +257,11 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT9]], <i8 0, i8 1, i8 2, i8 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i8> [[DOTSPLAT9]], <i8 0, i8 1, i8 2, i8 3>
 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[OFFSET_IDX]]
@@ -269,7 +269,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD10]], splat (i8 3)
 ; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP16]], <4 x i8> [[VEC_IND5]], <4 x i8> [[VEC_PHI7]]
 ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i32 [[INDEX4]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i8> [[VEC_IND5]], splat (i8 4)
+; CHECK-NEXT: [[VEC_IND_NEXT11]] = add nuw nsw <4 x i8> [[VEC_IND5]], splat (i8 4)
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT11]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 15daf90ad770c..813c040bd78f7 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -495,7 +495,7 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP1]] = add <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
@@ -513,7 +513,7 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
 ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
@@ -524,7 +524,7 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP6]] = add <4 x i64> [[VEC_PHI6]], [[WIDE_LOAD7]]
 ; CHECK-NEXT: store <4 x i64> [[VEC_IND5]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT9]] = add nuw nsw <4 x i64> [[VEC_IND5]], splat (i64 4)
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -589,7 +589,7 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP1]] = add <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: middle.block:
@@ -607,7 +607,7 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
 ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
@@ -618,7 +618,7 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) {
 ; CHECK-NEXT: [[TMP6]] = add <4 x i64> [[VEC_PHI5]], [[WIDE_LOAD7]]
 ; CHECK-NEXT: store <4 x i64> [[VEC_IND6]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND6]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT9]] = add nuw nsw <4 x i64> [[VEC_IND6]], splat (i64 4)
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index d3c8c1304b588..2f05435bc75ba 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -689,6 +689,17 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr, i64 %n
 ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 1, [[INDEX1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[OFFSET_IDX1]]
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP2]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x double> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: store <4 x double> [[TMP8]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[INDEX]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
@@ -801,7 +812,7 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr, i64 %n) {
 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -865,7 +876,7 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr, i64 %n) {
 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index bd0c098d335a2..cebd52fa7f866 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -875,14 +875,14 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
+; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[STEP_ADD]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -965,13 +965,13 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; SINK-AFTER-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
+; SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
 ; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER: vector.body:
 ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
+; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -4)
 ; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SINK-AFTER: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index 53d5ac472c892..1c7ec9b28bd71 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -37,19 +37,19 @@ define void @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N)
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
-; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <8 x i32> [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i32 [[TMP0]], 8
 ; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP9]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -126,19 +126,19 @@ define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N,
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> poison, i32 [[J_012]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
-; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[J_012]], 8
+; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <8 x i32> [[DOTSPLAT]], [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i32 [[J_012]], 8
 ; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
@@ -219,19 +219,19 @@ define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[STEP]] to i32
 ; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
-; CHECK-NEXT: [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP3]], 8
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <8 x i32> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw i32 [[TMP3]], 8
 ; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 66e4de5da7955..76fa6bdb543a7 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -29,7 +29,7 @@ define void @multi_int_induction(ptr %A, i32 %N) {
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -70,7 +70,7 @@ define void @multi_int_induction(ptr %A, i32 %N) {
 ; IND-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IND: middle.block:
@@ -159,7 +159,7 @@ define void @multi_int_induction(ptr %A, i32 %N) {
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -1578,7 +1578,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP18]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP19]], align 1, !alias.scope [[META20]], !noalias [[META17]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
@@ -1631,18 +1631,18 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; IND-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; IND-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0
 ; IND-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1
-; IND-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
-; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
-; IND-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17:![0-9]+]]
-; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope [[META17]]
+; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP11]]
+; IND-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]]
+; IND-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope [[META17:![0-9]+]]
+; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17]]
 ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]]
 ; IND-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4
 ; IND-NEXT: [[TMP17:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]]
 ; IND-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 12
-; IND-NEXT: store i32 [[TMP25]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; IND-NEXT: store i32 [[TMP14]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
 ; IND-NEXT: store i32 [[TMP15]], ptr [[TMP18]], align 1, !alias.scope [[META20]], !noalias [[META17]]
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
 ; IND-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; IND: middle.block:
@@ -1801,7 +1801,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) {
 ; UNROLL-NO-IC-NEXT: store i32 [[TMP25]], ptr [[TMP29]], align 1, !alias.scope [[META20]], !noalias [[META17]]
 ; UNROLL-NO-IC-NEXT: store i32 [[TMP26]], ptr [[TMP30]], align 1, !alias.scope [[META20]], !noalias [[META17]]
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[STEP_ADD]], splat (i64 2)
 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -2439,7 +2439,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK: middle.block:
@@ -2488,7 +2488,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2
 ; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; IND: middle.block:
@@ -2611,7 +2611,7 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) {
 ; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2
 ; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -4048,7 +4048,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK: middle.block:
@@ -4081,7 +4081,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
 ; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
 ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; IND: middle.block:
@@ -4155,7 +4155,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -4249,7 +4249,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
 ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK: middle.block:
@@ -4287,7 +4287,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
 ; IND-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
 ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; IND: middle.block:
@@ -4377,7 +4377,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP9]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -4470,7 +4470,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -4479,7 +4479,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
 ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; CHECK: middle.block:
@@ -4508,7 +4508,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]]
 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0
 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
 ; IND-NEXT: br label [[VECTOR_BODY:%.*]]
 ; IND: vector.body:
 ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -4518,7 +4518,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; IND-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
 ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; IND: middle.block:
@@ -4548,7 +4548,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]]
 ; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0
 ; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL: vector.body:
 ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -4592,7 +4592,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[I]], i64 0
 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -4604,7 +4604,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -4633,7 +4633,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i32 [[I]], [[N_VEC]]
 ; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i64 0
 ; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -4698,7 +4698,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; CHECK: middle.block:
@@ -4736,7 +4736,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
 ; IND-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
 ; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 4)
 ; IND-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; IND: middle.block:
@@ -4819,7 +4819,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 4)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 4)
 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -4943,7 +4943,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; CHECK-NEXT: [[TMP14:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32>
 ; CHECK-NEXT: [[TMP15]] = or <2 x i32> [[VEC_PHI]], [[TMP14]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; CHECK: middle.block:
@@ -4986,7 +4986,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; IND-NEXT: [[TMP13:%.*]] = sext <2 x i16> [[PREDPHI]] to <2 x i32>
 ; IND-NEXT: [[TMP14]] = or <2 x i32> [[VEC_PHI]], [[TMP13]]
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i16> [[VEC_IND]], splat (i16 2)
 ; IND-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; IND-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; IND: middle.block:
@@ -5121,7 +5121,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; UNROLL-NO-IC-NEXT: [[TMP28]] = or <2 x i32> [[VEC_PHI]], [[TMP26]]
 ; UNROLL-NO-IC-NEXT: [[TMP29]] = or <2 x i32> [[VEC_PHI1]], [[TMP27]]
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[STEP_ADD]], splat (i16 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i16> [[STEP_ADD]], splat (i16 2)
 ; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -5309,9 +5309,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP10]] = add <2 x i64> [[TMP6]], [[TMP9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <2 x i32> [[VEC_IND2]], splat (i32 2)
-; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <2 x i32> [[VEC_IND4]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT3]] = add nuw nsw <2 x i32> [[VEC_IND2]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT5]] = add nuw nsw <2 x i32> [[VEC_IND4]], splat (i32 2)
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
 ; CHECK: middle.block:
@@ -5360,18 +5360,18 @@ define i64 @trunc_with_first_order_recurrence() {
 ; IND-NEXT: [[TMP0:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND2]], <2 x i32> <i32 1, i32 2>
 ; IND-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[VEC_IND]], [[VEC_IND2]]
 ; IND-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], splat (i32 42)
-; IND-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP0]], [[VEC_IND2]]
+; IND-NEXT: [[TMP3:%.*]] = add nuw <2 x i32> [[TMP0]], [[VEC_IND2]]
 ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], [[TMP2]]
 ; IND-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
 ; IND-NEXT: [[TMP6:%.*]] = add <2 x i64> [[VEC_PHI]], [[TMP5]]
-; IND-NEXT: [[TMP7:%.*]] = shl <2 x i32> [[VEC_IND4]], splat (i32 1)
+; IND-NEXT: [[TMP7:%.*]] = shl nuw <2 x i32> [[VEC_IND4]], splat (i32 1)
 ; IND-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP2]], [[TMP7]]
 ; IND-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
 ; IND-NEXT: [[TMP10]] = add <2 x i64> [[TMP6]], [[TMP9]]
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; IND-NEXT: [[VEC_IND_NEXT3]] = add <2 x i32> [[VEC_IND2]], splat (i32 2)
-; IND-NEXT: [[VEC_IND_NEXT5]] = add <2 x i32> [[VEC_IND4]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT3]] = add nuw nsw <2 x i32> [[VEC_IND2]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT5]] = add nuw nsw <2 x i32> [[VEC_IND4]], splat (i32 2)
 ; IND-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; IND-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
 ; IND: middle.block:
@@ -5518,9 +5518,9 @@ define i64 @trunc_with_first_order_recurrence() {
 ; UNROLL-NO-IC-NEXT: [[TMP20]] = add <2 x i64> [[TMP12]], [[TMP18]]
 ; UNROLL-NO-IC-NEXT: [[TMP21]] = add <2 x i64> [[TMP13]], [[TMP19]]
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT4]] = add <2 x i32> [[STEP_ADD7]], splat (i32 2)
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT6]] = add <2 x i32> [[STEP_ADD8]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT4]] = add nuw nsw <2 x i32> [[STEP_ADD7]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT6]] = add nuw nsw <2 x i32> [[STEP_ADD8]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 112
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -5683,7 +5683,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]]
 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; CHECK: middle.block:
@@ -5711,7 +5711,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; IND-NEXT: [[TMP5:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP2]]
 ; IND-NEXT: store <2 x i32> [[TMP5]], ptr [[TMP4]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; IND-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2)
 ; IND-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; IND-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; IND: middle.block:
@@ -5778,7 +5778,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP7]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP9]], ptr [[TMP11]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[STEP_ADD]], splat (i32 2)
 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -5900,9 +5900,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 2
+; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul nsw i32 [[STEP]], 2
 ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -5914,7 +5914,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP21]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; CHECK: middle.block:
@@ -5972,8 +5972,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]]
 ; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0
 ; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT: [[TMP15:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
-; IND-NEXT: [[TMP16:%.*]] = shl i32 [[STEP]], 1
+; IND-NEXT: [[TMP15:%.*]] = mul nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; IND-NEXT: [[TMP16:%.*]] = shl nsw i32 [[STEP]], 1
 ; IND-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0
 ; IND-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; IND-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -5985,7 +5985,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; IND-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
 ; IND-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP18]], align 4
 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
+; IND-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
 ; IND-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IND-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; IND: middle.block:
@@ -6044,7 +6044,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]]
 ; UNROLL-NEXT: [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLAT]], splat (i32 1)
-; UNROLL-NEXT: [[TMP17:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NEXT: [[TMP17:%.*]] = mul nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL: vector.body:
 ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6058,7 +6058,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4
 ; UNROLL-NEXT: store <2 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[TMP16]]
+; UNROLL-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[STEP_ADD]], [[TMP16]]
 ; UNROLL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; UNROLL: middle.block:
@@ -6120,8 +6120,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> splat (i32 2), [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP18]]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> <i32 0, i32 1>, [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> zeroinitializer, [[TMP19]]
 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC: vector.body:
 ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6135,7 +6135,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP22]], align 4
 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP24]], align 4
 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[TMP17]]
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[STEP_ADD]], [[TMP17]]
 ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
@@ -6194,7 +6194,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]]
 ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLAT]], splat (i32 2)
-; INTERLEAVE-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; INTERLEAVE-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE: vector.body:
 ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -6208,7 +6208,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4
 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[TMP16]]
+; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[STEP_ADD]], [[TMP16]]
 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; INTERLEAVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
 ; INTERLEAVE: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 8975c058c6b79..da8efa77bf5e6 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -105,7 +105,7 @@ define i32 @cond_branch(i32 %a, ptr %src)
{ ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> splat (i32 10) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -152,7 +152,7 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll index d6a6fded712a5..7224e26d6a0e6 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-gep-nowrap-flags.ll @@ -33,12 +33,13 @@ define void @nusw_preservation(ptr noalias %A, ptr %B) { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -94,12 +95,13 @@ define void @inbounds_preservation(ptr noalias %A, ptr %B) { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -155,12 +157,13 @@ define void @nuw_drop(ptr noalias %A, ptr %B) { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr 
[[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -209,8 +212,9 @@ define void @nusw_preservation_2(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -258,8 +262,9 @@ define void @inbounds_preservation_2(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -307,8 +312,9 @@ define void @nuw_drop_2(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index fc2e2337e0569..de079374eb401 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -345,7 +345,7 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -595,7 +595,7 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8 ; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; 
CHECK: middle.block: @@ -721,7 +721,7 @@ define void @mixed_load3_store3(ptr nocapture %A) { ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <12 x i32> ; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: @@ -1157,13 +1157,13 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 -; CHECK-NEXT: [[DOTSPLIT3:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[DOTSPLIT3:%.*]] = getelementptr inbounds nuw [[PAIR_I32]], ptr [[P]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT3]], i64 4 -; CHECK-NEXT: [[DOTSPLIT4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[DOTSPLIT4:%.*]] = getelementptr inbounds nuw [[PAIR_I32]], ptr [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT4]], i64 4 -; CHECK-NEXT: [[DOTSPLIT5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[DOTSPLIT5:%.*]] = getelementptr inbounds nuw [[PAIR_I32]], ptr [[P]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT5]], i64 4 -; CHECK-NEXT: [[DOTSPLIT6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]] +; CHECK-NEXT: [[DOTSPLIT6:%.*]] = getelementptr inbounds nuw [[PAIR_I32]], ptr [[P]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT6]], i64 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 @@ -1178,7 +1178,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: @@ -1346,12 +1346,12 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 ; CHECK-NEXT: 
[[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3) +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 @@ -1385,7 +1385,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP11]], align 4 ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 70b1ea13677b8..69d2aa4c620c1 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -23,7 +23,7 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3) ; IC1VF4-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; IC1VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 ; IC1VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IC1VF4: [[MIDDLE_BLOCK]]: @@ -77,7 +77,7 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4) +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -4) ; IC4VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 ; IC4VF4-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: @@ -180,7 +180,7 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC1VF4-NEXT: [[TMP4:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1) ; IC1VF4-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]] ; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 -4) +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4) ; IC1VF4-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 ; IC1VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IC1VF4: 
[[MIDDLE_BLOCK]]: @@ -398,7 +398,7 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4) +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4) ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]]) @@ -506,7 +506,7 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC1VF4-NEXT: [[TMP4:%.*]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -1) ; IC1VF4-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[VEC_PHI]] ; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 -4) +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4) ; IC1VF4-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 ; IC1VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IC1VF4: [[MIDDLE_BLOCK]]: @@ -724,7 +724,7 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC4VF4-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP2]], <4 x i16> [[TMP110]], <4 x i16> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP115:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> [[TMP111]], <4 x i16> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD_3]], splat (i16 -4) +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[STEP_ADD_3]], splat (i16 -4) ; IC4VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: ; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i16> @llvm.smin.v4i16(<4 x i16> [[TMP112]], <4 x i16> [[TMP113]]) @@ -829,7 +829,7 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC1VF4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3) ; IC1VF4-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; IC1VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808 ; IC1VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IC1VF4: [[MIDDLE_BLOCK]]: @@ -883,7 +883,7 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; IC4VF4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4) +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 -4) ; IC4VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808 ; IC4VF4-NEXT: br i1 
[[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index c958ea7b9b88e..21ef1885b75b9 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -22,7 +22,7 @@ define i64 @select_icmp_nuw_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -92,7 +92,7 @@ define i64 @select_icmp_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index b991d58eb2b8d..c55b089043e25 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -30,7 +30,7 @@ define i32 @select_icmp_const_truncated_iv_widened_exit(ptr %a, i32 %n) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -101,7 +101,7 @@ define i32 @select_icmp_const_truncated_iv_widened_exit(ptr %a, i32 %n) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -253,7 +253,7 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -297,7 +297,7 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000 ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -396,7 +396,7 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -440,7 +440,7 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -544,7 +544,7 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) ; CHECK-VF4IC1-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; 
CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -589,7 +589,7 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP11]] = select <4 x i1> [[TMP7]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index 91c83103bf8f2..8d3bd267b9482 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -22,7 +22,7 @@ define i64 @select_icmp_const_1(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -85,7 +85,7 @@ define i64 @select_icmp_const_1(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -215,7 +215,7 @@ define i64 @select_icmp_const_2(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -278,7 +278,7 @@ define i64 @select_icmp_const_2(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: 
[[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[VEC_PHI2]], <4 x i64> [[STEP_ADD_2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_PHI3]], <4 x i64> [[STEP_ADD_3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -408,7 +408,7 @@ define i64 @select_icmp_const_3_variable_rdx_start(ptr %a, i64 %rdx.start, i64 % ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -471,7 +471,7 @@ define i64 @select_icmp_const_3_variable_rdx_start(ptr %a, i64 %rdx.start, i64 % ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -601,7 +601,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -664,7 +664,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; 
CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -794,7 +794,7 @@ define i64 @select_fcmp_const(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -857,7 +857,7 @@ define i64 @select_fcmp_const(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP12]] = select <4 x i1> [[TMP8]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -989,7 +989,7 @@ define i64 @select_icmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1062,7 +1062,7 @@ define i64 @select_icmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1208,7 +1208,7 @@ define i64 @select_fcmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> 
[[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1281,7 +1281,7 @@ define i64 @select_fcmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1428,7 +1428,7 @@ define i64 @select_icmp_min_valid_iv_start(ptr %a, ptr %b, i64 %rdx.start, i64 % ; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1505,7 +1505,7 @@ define i64 @select_icmp_min_valid_iv_start(ptr %a, ptr %b, i64 %rdx.start, i64 % ; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[TMP18]] = select <4 x i1> [[TMP14]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI4]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1661,7 +1661,7 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) { ; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372036854775804 ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1730,7 +1730,7 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) { ; CHECK-VF4IC4-NEXT: [[TMP16]] = select <4 x 
i1> [[TMP12]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]] ; CHECK-VF4IC4-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372036854775792 ; CHECK-VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 86515ebe25637..162803a377bc0 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -388,7 +388,7 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]] ; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VEC-NEXT: [[VEC_IND_NEXT]] = add nuw <2 x i64> [[VEC_IND]], splat (i64 2) ; VEC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002 ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: @@ -448,7 +448,7 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]] ; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4 ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VEC-NEXT: [[VEC_IND_NEXT]] = add nuw <2 x i64> [[VEC_IND]], splat (i64 2) ; VEC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002 ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: @@ -597,7 +597,7 @@ define i32 @postinc_not_iv_backedge_value(i32 %k) { ; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VEC-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; VEC-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; VEC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; VEC-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 00256a5c4a456..4b3f370f4cdaf 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -418,7 +418,7 @@ define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %des ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: @@ -577,7 +577,7 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: @@ -684,7 +684,7 @@ define void @adding_offset_overflows(i32 %n, ptr %A) { ; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -707,7 +707,7 @@ define void @adding_offset_overflows(i32 %n, ptr %A) { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index f8ddd344f5587..a598f154ef54b 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -21,7 +21,7 @@ define void @vector_gep(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x ptr> [[TMP0]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll index 1e4a98d22bf17..9c73d85bd58ff 100644 --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -23,14 +23,14 @@ define i32 @test1() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 
2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -38,18 +38,18 @@ define i32 @test1() { ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] -; CHECK: [[_LR_PH_I]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] +; CHECK: [[_LR_PH_I1]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I1:.*:]] -; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] +; CHECK: [[_LR_PH_I:.*:]] +; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] ; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10 ; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]] ; CHECK: [[UNNAMEDBB10]]: @@ -96,14 +96,14 @@ define i32 @test2() { ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> [[DOTSPLAT]], ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -111,18 +111,18 @@ define i32 @test2() { ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat 
(i32 10)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> [[VEC_IND]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]]
-; CHECK: [[_LR_PH_I]]:
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]]
+; CHECK: [[_LR_PH_I1]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ]
; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]]
-; CHECK: [[_LR_PH_I1:.*:]]
-; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ]
+; CHECK: [[_LR_PH_I:.*:]]
+; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ]
; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10
; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]]
; CHECK: [[UNNAMEDBB10]]:
@@ -169,7 +169,7 @@ define i32 @test3(i32 %N) {
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
@@ -178,7 +178,7 @@ define i32 @test3(i32 %N) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -189,18 +189,18 @@ define i32 @test3(i32 %N) {
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> zeroinitializer, <2 x i32> splat (i32 2)
; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[PREDPHI]], <2 x i32> splat (i32 1)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[PREDPHI1]], i32 1
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]]
-; CHECK: [[_LR_PH_I1]]:
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]]
+; CHECK: [[_LR_PH_I]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ]
; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]]
-; CHECK: [[_LR_PH_I:.*:]]
-; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ]
+; CHECK: [[_LR_PH_I1:.*:]]
+; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ]
; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10
; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]]
; CHECK: [[UNNAMEDBB10]]:
@@ -257,14 +257,14 @@ define i32 @test4(i32 %N) {
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -272,18 +272,18 @@ define i32 @test4(i32 %N) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[PREDPHI]], i32 1
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]]
-; CHECK: [[_LR_PH_I]]:
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]]
+; CHECK: [[_LR_PH_I1]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[DOTLR_PH_I_PREHEADER]] ]
; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]]
-; CHECK: [[_LR_PH_I:.*:]]
-; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ]
+; CHECK: [[_LR_PH_I:.*:]]
+; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ]
; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10
; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]]
; CHECK: [[UNNAMEDBB10]]:
@@ -519,14 +519,14 @@ define i8 @outside_user_non_phi() {
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 4)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -535,18 +535,18 @@ define i8 @outside_user_non_phi() {
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[PREDPHI]] to <2 x i8>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP4]], i32 1
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]]
-; CHECK: [[_LR_PH_I]]:
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]]
+; CHECK: [[_LR_PH_I1]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ]
; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]]
-; CHECK: [[_LR_PH_I1:.*:]]
-; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ]
+; CHECK: [[_LR_PH_I:.*:]]
+; CHECK-NEXT: [[UNNAMEDTMP8:%.*]] = phi i32 [ [[UNNAMEDTMP18:%.*]], %[[BB16:.*]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ]
; CHECK-NEXT: [[UNNAMEDTMP2:%.*]] = icmp sgt i32 [[UNNAMEDTMP8]], 10
; CHECK-NEXT: br i1 [[UNNAMEDTMP2]], label %[[BB16]], label %[[UNNAMEDBB10:.*]]
; CHECK: [[UNNAMEDBB10]]:
@@ -650,14 +650,14 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) {
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]])
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[B_PROMOTED]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[_LR_PH_I1:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[C1]], [[B2]]
; CHECK-NEXT:
[[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 ; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[C1]], [[A3]] ; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i32 [[TMP3]], 8 ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[_LR_PH_I]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[_LR_PH_I1]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] @@ -680,12 +680,12 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I]] -; CHECK: [[_LR_PH_I]]: +; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[_LR_PH_I1]] +; CHECK: [[_LR_PH_I1]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ], [ [[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] -; CHECK: [[_LR_PH_I1:.*:]] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I]] ] +; CHECK: [[_LR_PH_I:.*:]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[_LR_PH_I1]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = sext i32 [[IV]] to i64 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[BLOAD:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 @@ -742,7 +742,7 @@ define i32 @non_uniform_live_out() { ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i8> [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store <2 x i8> [[TMP4]], ptr [[TMP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll index 481fa04cf7164..30d01e8b790a7 100644 --- a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll +++ b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll @@ -175,7 +175,7 @@ define void @predicated_noalias_scope_decl(ptr noalias nocapture readonly %a, pt ; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index bfc7feecafbc4..81095290bcc92 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ 
b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -654,7 +654,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -670,16 +670,16 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[VEC_EPILOG_RESUME_VAL]] to i32
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[VEC_IND7]] to <4 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]]
; CHECK-NEXT: store <4 x i8> [[TMP7]], ptr [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND7]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT7]] = add nuw nsw <4 x i32> [[VEC_IND7]], splat (i32 4)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -720,7 +720,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-PROFITABLE-BY-DEFAULT: [[MIDDLE_BLOCK]]:
@@ -736,16 +736,16 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = trunc i64 [[VEC_EPILOG_RESUME_VAL]] to i32
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = trunc <2 x i32> [[VEC_IND7]] to <2 x i8>
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[TMP7]], ptr [[TMP8]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], splat (i32 2)
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT7]] = add nuw nsw <2 x i32> [[VEC_IND7]], splat (i32 2)
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP10]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index f9f7feb7bdfbc..f57041257c919 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -571,7 +571,7 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[TMP0:%.*]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; NPGSO-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; NPGSO-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll
index 8a77d14b9d3ba..f7986d8f1f928 100644
--- a/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll
@@ -40,7 +40,7 @@ define void @inner_latch_header_first_successor(i64 %N, i32 %c, i64 %M) {
; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ]
; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP7:%.*]] =
icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -142,7 +142,7 @@ define void @inner_latch_header_second_successor(i64 %N, i32 %c, i64 %M) { ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ] ; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP9]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll index 2e17f7adca279..32dbc273cb6b6 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll @@ -42,7 +42,7 @@ define void @test(ptr %src, i64 %n) { ; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_1_LATCH5]], label [[LOOP_2_HEADER1]] ; CHECK: vector.latch: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll index 59e3d71f27a38..afd5000818bfc 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll @@ -36,7 +36,7 @@ define void @wide_phi_2_predecessors(ptr noalias %A, ptr noalias %B, i32 %c, i1 ; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] ; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -144,7 +144,7 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] ; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; 
CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
index b2f1954ca989b..020f84209b23b 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
@@ -53,7 +53,7 @@ define void @non_outermost_loop_hcfg_construction(i64 %n, ptr %a) {
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_LOOP_LATCH4]], label [[INNERMOST_LOOP3]]
; CHECK: vector.latch:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -196,7 +196,7 @@ define void @non_outermost_loop_hcfg_construction_other_loops_at_same_level(i64
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_LOOP_J0_CLEANUP4]], label [[INNERMOST_LOOP3]]
; CHECK: vector.latch:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
index 47743753349ed..b224a5a86b83d 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
@@ -25,8 +25,8 @@ define void @foo() {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP4]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 4 x i64> zeroinitializer, [[TMP5]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -50,7 +50,7 @@ define void @foo() {
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ [[TMP12]], [[INNER_LOOP1]] ]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VEC_PHI5]], <vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
index 4086c79082cce..0a19e0ec55a4b 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -48,7 +48,7 @@ define void @foo(i32 %n) {
; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR_BODY31]]
; CHECK: [[VECTOR_LATCH]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
index fb9b1c7d62e3e..5fcd6db2eb28a 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -50,7 +50,7 @@
; CHECK: [[ForInc]]:
; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4
-; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], splat (i64 4)
+; CHECK: %[[VecIndNext]] = add nuw nsw <4 x i64> %[[VecInd]], splat (i64 4)
; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index c491477c4d2be..623a9435edec1 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -71,9 +71,9 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT: [[TMP18:%.*]] = mul nsw i32 [[STEP]], 4
; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -83,7 +83,7 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP20]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -192,9 +192,9 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP18]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul nsw i32 [[STEP]], 4
; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i64 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -204,7 +204,7 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP19]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -385,9 +385,9 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[CONV]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[CONV]], 4
+; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP16]]
+; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i32 [[CONV]], 4
; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i64 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -397,7 +397,7 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/pr34681.ll b/llvm/test/Transforms/LoopVectorize/pr34681.ll
index 0f509a5c4eeb3..a04a4e9eea6fe 100644
--- a/llvm/test/Transforms/LoopVectorize/pr34681.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr34681.ll
@@ -80,7 +80,7 @@ define i32 @foo1(i32 %N, ptr nocapture readnone %A, ptr nocapture readonly %B, i
; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[TMP27]] to <4 x i32>
; CHECK-NEXT: [[TMP29]] = add <4 x i32> [[VEC_PHI]], [[TMP28]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -185,7 +185,7 @@ define i32 @foo2(i16 zeroext %N, ptr nocapture readnone %A, ptr nocapture readon
; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i16> [[TMP24]] to <4 x i32>
; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI]], [[TMP25]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/pr35773.ll b/llvm/test/Transforms/LoopVectorize/pr35773.ll
index b7165156e28fd..00b9e9133af9b 100644
--- a/llvm/test/Transforms/LoopVectorize/pr35773.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr35773.ll
@@ -17,8 +17,8 @@ define void @doit1(ptr %ptr) {
; CHECK-NEXT: store <4 x i32> [[I32_IV]], ptr [[GEP1]], align 4
; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i32 [[MAIN_IV]], 4
-; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], splat (i32 36)
-; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], splat (i8 36)
+; CHECK-NEXT: [[I32_IV_NEXT]] = add nuw nsw <4 x i32> [[I32_IV]], splat (i32 36)
+; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add nuw nsw <4 x i8> [[IV_FROM_TRUNC]], splat (i8 36)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
;
diff --git a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll
index 98963a72c5ad0..310c7729a6b63 100644
--- a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll
@@ -14,7 +14,7 @@ define i16 @duplicate_lcssa(i16 %val) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 -1, i16 -2, i16 -3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw <4 x i16> [[VEC_IND]], splat (i16 1)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 -4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i16> [[VEC_IND]], splat (i16 -4)
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
index 1bb6454cdeea2..c4a5a8b18dd5d 100644
--- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -72,12 +72,13 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
; CHECK-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32>
[[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index ebd532aa5032c..f17866718a0ee 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -27,7 +27,7 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] ; CHECK-NEXT: [[PREDPHI7]] = select i1 [[C_2]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index cf973affae5f2..5654dbd727f85 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -224,7 +224,7 @@ define void @predicated_phi_dbg(i64 %n, ptr %x) { ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[X]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP21]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -304,7 +304,7 @@ define void @predicated_phi_dbg(i64 %n, ptr %x) { ; DEBUGLOC-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[X]], i64 [[INDEX]], !dbg [[DBG57:![0-9]+]] ; DEBUGLOC-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP21]], align 8, !dbg [[DBG58:![0-9]+]] ; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG53]] -; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4), !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4), !dbg [[DBG53]] ; DEBUGLOC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG59:![0-9]+]] ; DEBUGLOC-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG59]], !llvm.loop [[LOOP60:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: @@ -385,7 +385,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]] ; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -428,7 +428,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) { ; DEBUGLOC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]], !dbg [[DBG77:![0-9]+]] ; DEBUGLOC-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP6]], align 4, !dbg [[DBG78:![0-9]+]] ; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG75]] -; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4), !dbg [[DBG76]] +; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4), !dbg [[DBG76]] ; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG79:![0-9]+]] ; DEBUGLOC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG79]], !llvm.loop [[LOOP80:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 5f54b0ac7834a..caf1a934ed70a 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -28,7 +28,7 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> ; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index c708715c623e6..dfdf1100eb57b 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -365,7 +365,7 @@ define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) { ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3 ; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -571,7 +571,7 @@ define void @reduc_store_middle_store_predicated(ptr %dst, ptr readonly %src) { ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3 ; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: 
br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index c858f201e01fa..d140bc09fe731 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -161,8 +161,8 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP7]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <vscale x 2 x i64> zeroinitializer, [[TMP8]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -187,7 +187,7 @@ define void @predicated_assume(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-NEXT: store <vscale x 2 x float> [[TMP15]], ptr [[TMP17]], align 4
; CHECK-NEXT: store <vscale x 2 x float> [[TMP16]], ptr [[TMP20]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 359132f7904cc..d87d39e684993 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -43,7 +43,7 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP13]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -119,7 +119,7 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP11]], ptr [[TMP13]], align 8
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP12]], ptr [[TMP15]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 1 x i64> [[STEP_ADD]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
@@ -184,9 +184,9 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl <vscale x 4 x i32> [[TMP6]], splat (i32 1)
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 4 x i32> [[TMP6]], splat (i32 1)
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i32 [[TMP8]], 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -196,7 +196,7 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: store <vscale x 4 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index cde2de73b7bfd..70127a6762423 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -25,7 +25,7 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP6]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -87,7 +87,7 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
@@ -152,9 +152,9 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
-; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <2 x i16> [[VEC_IND3]], splat (i16 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add nuw nsw <2 x i16> [[VEC_IND1]], splat (i16 2)
+; CHECK-NEXT: [[VEC_IND_NEXT4]] = add nuw nsw <2 x i16> [[VEC_IND3]], splat (i16 2)
; CHECK-NEXT:
[[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -231,7 +231,7 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP18]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: @@ -283,7 +283,7 @@ define void @duplicated_incoming_blocks_blend(i32 %x, ptr %ptr) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll index b85f2746a0b14..98203a245f863 100644 --- a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll +++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll @@ -18,7 +18,7 @@ define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[RESULT:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 3b515a2acb1a7..24dc182fe24a1 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -114,7 +114,7 @@ define void @blend_chain_iv(i1 %c) { ; CHECK-NEXT: store i16 0, ptr [[TMP6]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[PREDPHI1]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[PREDPHI1]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll index 927fefc73ceea..f80d7b695e2af 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll +++ 
b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll @@ -112,7 +112,7 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -168,7 +168,7 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: @@ -272,7 +272,7 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: @@ -328,7 +328,7 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -383,7 +383,7 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -520,7 +520,7 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: @@ -569,7 +569,7 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: @@ -625,7 +625,7 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: @@ -729,7 +729,7 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: @@ -785,7 +785,7 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: @@ -841,7 +841,7 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: @@ -937,7 +937,7 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i16 [[TMP2]], ptr 
[[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 56 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll index d6277d657ea7e..0f191b2d8a278 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll @@ -112,7 +112,7 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -168,7 +168,7 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: @@ -270,7 +270,7 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: @@ -325,7 +325,7 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: @@ -374,7 +374,7 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: @@ -430,7 +430,7 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: @@ -486,7 +486,7 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: @@ -542,7 +542,7 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll index edf04bbcbcdff..7ff10c544f72a 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll @@ -53,7 +53,7 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <8 x i64> [[TMP34]], ptr [[TMP35]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -129,7 +129,7 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <8 x i64> [[TMP35]], ptr [[TMP36]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: @@ -203,7 +203,7 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <8 x i64> [[TMP33]], ptr [[TMP34]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll index 32873a4e90e81..a5bb07f1fd4ef 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll @@ -124,7 +124,7 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP18]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4: middle.block: @@ -248,7 +248,7 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF2: middle.block: @@ -301,7 +301,7 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 ; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF4: middle.block: @@ -442,7 +442,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> 
[[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: middle.block: @@ -494,7 +494,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 ; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF4: middle.block: @@ -549,7 +549,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 ; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VF2: middle.block: @@ -601,7 +601,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 ; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VF4: middle.block: @@ -651,7 +651,7 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF2-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VF2: middle.block: @@ -689,7 +689,7 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP18]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VF4: middle.block: @@ -830,7 +830,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 
-; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; VF2: middle.block: @@ -883,7 +883,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; VF4: middle.block: @@ -939,7 +939,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; VF2: middle.block: @@ -992,7 +992,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; VF4: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index 607d1365098f2..fb962c017156d 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -29,8 +29,8 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2: middle.block: @@ -71,8 +71,8 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 
4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF4: middle.block: @@ -162,8 +162,8 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4: middle.block: @@ -219,8 +219,8 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2: middle.block: @@ -261,8 +261,8 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF4: middle.block: @@ -325,8 +325,8 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP5:![0-9]+]] ; VF2: middle.block: @@ -382,8 +382,8 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF4: middle.block: @@ -446,8 +446,8 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF2: middle.block: @@ -503,8 +503,8 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF4: middle.block: @@ -567,8 +567,8 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: middle.block: @@ -624,8 +624,8 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; 
VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF4: middle.block: @@ -688,8 +688,8 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VF2: middle.block: @@ -744,8 +744,8 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VF4: middle.block: @@ -807,8 +807,8 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VF2: middle.block: @@ -863,8 +863,8 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VF4: middle.block: @@ -926,8 +926,8 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; 
VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VF2: middle.block: @@ -982,8 +982,8 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VF4: middle.block: @@ -1039,8 +1039,8 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; VF2: middle.block: @@ -1081,8 +1081,8 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; VF4: middle.block: @@ -1138,8 +1138,8 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; VF2: middle.block: @@ -1180,8 +1180,8 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; VF4: middle.block: @@ -1237,8 +1237,8 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; VF2: middle.block: @@ -1279,8 +1279,8 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] ; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; VF4: middle.block: @@ -1343,8 +1343,8 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; VF2: middle.block: @@ -1400,8 +1400,8 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: 
[[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; VF4: middle.block: @@ -1464,8 +1464,8 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; VF2: middle.block: @@ -1521,8 +1521,8 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; VF4: middle.block: @@ -1585,8 +1585,8 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 4) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; VF2: middle.block: @@ -1642,8 +1642,8 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 8) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; VF4: middle.block: @@ -1706,8 +1706,8 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 
[[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; VF2: middle.block: @@ -1763,8 +1763,8 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; VF4: middle.block: @@ -1827,8 +1827,8 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; VF2: middle.block: @@ -1884,8 +1884,8 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; VF4: middle.block: @@ -1948,8 +1948,8 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) -; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) +; VF2-NEXT: [[VEC_IND_NEXT]] = add nsw <2 x i64> [[VEC_IND]], splat (i64 6) +; VF2-NEXT: [[VEC_IND_NEXT2]] = add nsw <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF2-NEXT: br i1 
[[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; VF2: middle.block: @@ -2005,8 +2005,8 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) -; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) +; VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 12) +; VF4-NEXT: [[VEC_IND_NEXT2]] = add nsw <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; VF4: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll index 94bc32205ec11..90b4be0f96dce 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-geps.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll @@ -20,7 +20,7 @@ define void @vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index ef678ff759943..1668fd49a22a3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -223,7 +223,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) { ; CHECK: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<{{.+}}> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<{{.+}}> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<-9223372036854775808>, ir<%cond> ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr inbounds ir<%a>, vp<[[SCALAR_STEPS]]> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 84c6cc2675a80..754aee99ed865 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -165,7 +165,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: ir<%i> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: ir<%i> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%i>, ir<5> ; CHECK-NEXT: Successor(s): pred.udiv @@ -534,7 +534,7 @@ define i32 
@print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off> diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 3161a0d5e6f5e..88dead4418628 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -276,7 +276,7 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<21>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<21>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<21> + vp<[[CAN_IV]]> * ir<1> ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]>, vp<[[BTC]]> @@ -1060,7 +1060,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, i32 %k, ptr noalias %sr ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<%n>, ir<-1>, vp<[[VF]]> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nsw ir<%n>, ir<-1>, vp<[[VF]]> ; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> ; CHECK-NEXT: WIDEN ir<%cond> = icmp ult ir<%iv>, ir<%k> ; CHECK-NEXT: Successor(s): pred.store diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll index eaebfebf533ea..909ca14f21639 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -41,7 +41,7 @@ define void @inner_loop_reduction(ptr noalias nocapture readonly %a.in, ptr noal ; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> align 8 %[[C_PTR]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 -; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], splat (i64 4) +; CHECK-NEXT: %{{.*}} = add nuw nsw <4 x i64> %[[VEC_INDEX]], splat (i64 4) ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll index 180fd84c14450..72ba993defb5a 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll @@ -34,7 +34,7 @@ define void @widen_call_instruction(ptr noalias nocapture readonly %a.in, ptr no ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr 
[[C_OUT]], <4 x i64> [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP9]], <4 x ptr> align 8 [[TMP7]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll index 48a11fa9182e7..429e964890103 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll @@ -36,7 +36,7 @@ define void @loop_invariant_select(ptr noalias nocapture %out, i1 %select, doubl ; CHECK-NEXT: br i1 [[TMP5]], label %[[VECTOR_LATCH]], label %[[FOR2_HEADER3]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -96,7 +96,7 @@ define void @outer_loop_dependant_select(ptr noalias nocapture %out, double %a, ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR2_HEADER3]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -157,7 +157,7 @@ define void @inner_loop_dependant_select(ptr noalias nocapture %out, double %a, ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR2_HEADER3]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -219,7 +219,7 @@ define void @outer_and_inner_loop_dependant_select(ptr noalias nocapture %out, d ; CHECK-NEXT: br i1 [[TMP7]], label %[[VECTOR_LATCH]], label %[[FOR2_HEADER3]] ; CHECK: [[VECTOR_LATCH]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll 
b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
index d08ca8c99e8ba..9bb010c0431d8 100644
--- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
@@ -102,7 +102,7 @@ define void @wide_gep_multiple_indices_some_invariant(ptr noalias %dst, ptr noal
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:

From fd1bdfde14c05f00a802f33717dfa72284317bbf Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Mon, 17 Nov 2025 08:45:08 -0500
Subject: [PATCH 014/105] Revert "[clang][SourceManager] Use `getFileLoc` when computing `getPresumedLoc`" (#168368)

Reverts llvm/llvm-project#166255

It broke bots: https://lab.llvm.org/buildbot/#/builders/190/builds/31102
---
 clang/include/clang/Basic/SourceManager.h           |  5 ++---
 clang/lib/Basic/SourceManager.cpp                   |  2 +-
 clang/test/Analysis/plist-macros-with-expansion.cpp |  8 ++++----
 clang/test/C/C23/n2350.c                            |  5 +++--
 clang/test/ExtractAPI/macro_undefined.c             |  4 ++--
 clang/test/FixIt/format.cpp                         |  8 ++++----
 clang/test/Preprocessor/macro_arg_directive.c       |  4 ++--
 clang/test/Preprocessor/print_line_track.c          | 11 ++++++-----
 8 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index f15257a760b8c..bc9e97863556d 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -1464,9 +1464,8 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// directives. This provides a view on the data that a user should see
   /// in diagnostics, for example.
   ///
-  /// If \p Loc is a macro expansion location, the presumed location
-  /// computation uses the spelling location for macro arguments and the
-  /// expansion location for other macro expansions.
+  /// Note that a presumed location is always given as the expansion point of
+  /// an expansion location, not at the spelling location.
   ///
   /// \returns The presumed location of the specified SourceLocation. If the
   /// presumed location cannot be calculated (e.g., because \p Loc is invalid
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 767a765ae4261..b6cc6ec9365f5 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -1435,7 +1435,7 @@ PresumedLoc SourceManager::getPresumedLoc(SourceLocation Loc,
   if (Loc.isInvalid()) return PresumedLoc();
 
   // Presumed locations are always for expansion points.
-  FileIDAndOffset LocInfo = getDecomposedLoc(getFileLoc(Loc));
+  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
 
   bool Invalid = false;
   const SLocEntry &Entry = getSLocEntry(LocInfo.first, &Invalid);
diff --git a/clang/test/Analysis/plist-macros-with-expansion.cpp b/clang/test/Analysis/plist-macros-with-expansion.cpp
index d9a2f94055593..d57bb0f2dd265 100644
--- a/clang/test/Analysis/plist-macros-with-expansion.cpp
+++ b/clang/test/Analysis/plist-macros-with-expansion.cpp
@@ -405,14 +405,14 @@ void commaInBracketsTest() {
   code
 
 void commaInBracesTest() {
-  PASTE_CODE({
+  PASTE_CODE({ // expected-warning{{Dereference of null pointer}}
     // NOTE: If we were to add a new variable here after a comma, we'd get a
     // compilation error, so this test is mainly here to show that this was also
     // investigated.
     //
     //   int *ptr = nullptr, a;
     int *ptr = nullptr;
-    *ptr = 5; // expected-warning{{Dereference of null pointer}}
+    *ptr = 5;
   })
 }
 
@@ -425,14 +425,14 @@ void commaInBracesTest() {
 // CHECK-NEXT:
 // CHECK-NEXT:    <key>col</key><integer>3</integer>
 // CHECK-NEXT:    <key>file</key><integer>0</integer>
 // CHECK-NEXT:
-// CHECK-NEXT:    <key>name</key><string>PASTE_CODE({
+// CHECK-NEXT:    <key>name</key><string>PASTE_CODE({ // expected-
 // CHECK-NEXT:  // NOTE: If we were to add a new variable here after a comma, we'd get a
 // CHECK-NEXT:  // compilation error, so this test is mainly here to show that this was also
 // CHECK-NEXT:  // investigated.
 // CHECK-NEXT:  //
 // CHECK-NEXT:  //   int *ptr = nullptr, a;
 // CHECK-NEXT:  int *ptr = nullptr;
-// CHECK-NEXT:  *ptr = 5; // expected-
+// CHECK-NEXT:  *ptr = 5;
 // CHECK-NEXT:  })</string>
 // CHECK-NEXT:    <key>expansion</key><string>{int *ptr =nullptr ;*ptr =5;}</string>
 // CHECK-NEXT:
diff --git a/clang/test/C/C23/n2350.c b/clang/test/C/C23/n2350.c
index 96b8c511d5716..af0ca6d79be5e 100644
--- a/clang/test/C/C23/n2350.c
+++ b/clang/test/C/C23/n2350.c
@@ -47,10 +47,11 @@ int struct_in_second_param(void) {
 int macro(void) {
   return offsetof(struct A // cpp-error {{'A' cannot be defined in a type specifier}} \
-                              expected-warning {{defining a type within 'offsetof' is a C23 extension}}
+                              expected-warning 2 {{defining a type within 'offsetof' is a C23 extension}}
   {
     int a;
-    struct B // expected-warning {{defining a type within 'offsetof' is a C23 extension}}
+    struct B // verifier seems to think the error is emitted by the macro
+             // In fact the location of the error is "B" on the line above
     {
       int c;
       int d;
diff --git a/clang/test/ExtractAPI/macro_undefined.c b/clang/test/ExtractAPI/macro_undefined.c
index 1d697db1e1613..7bb50af380c24 100644
--- a/clang/test/ExtractAPI/macro_undefined.c
+++ b/clang/test/ExtractAPI/macro_undefined.c
@@ -89,7 +89,7 @@ FUNC_GEN(bar, const int *, unsigned);
       },
       "location": {
         "position": {
-          "character": 9,
+          "character": 0,
           "line": 2
         },
         "uri": "file://INPUT_DIR/input.h"
@@ -241,7 +241,7 @@ FUNC_GEN(bar, const int *, unsigned);
       },
       "location": {
         "position": {
-          "character": 9,
+          "character": 0,
           "line": 3
         },
         "uri": "file://INPUT_DIR/input.h"
diff --git a/clang/test/FixIt/format.cpp b/clang/test/FixIt/format.cpp
index db642b60ffd95..d663c0fb35e13 100644
--- a/clang/test/FixIt/format.cpp
+++ b/clang/test/FixIt/format.cpp
@@ -56,9 +56,9 @@ void a(N::E NEVal, S *SPtr, S &SRef) {
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:7}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:17}:")"
 
-  LOG(
+  LOG( // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
       "%d",
-      SPtr->Type // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
+      SPtr->Type
   );
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:7-[[@LINE-2]]:7}:"static_cast<int>("
  // CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:17-[[@LINE-3]]:17}:")"
 
@@ -68,8 +68,8 @@ void a(N::E NEVal, S *SPtr, S &SRef) {
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:7}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:")"
 
-  LOG("%d",
-      SRef.Type); // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
+  LOG("%d", // expected-warning{{format specifies type 'int' but the argument has type 'N::E'}}
+      SRef.Type);
   // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:7-[[@LINE-1]]:7}:"static_cast<int>("
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:16-[[@LINE-2]]:16}:")"
 
diff --git a/clang/test/Preprocessor/macro_arg_directive.c b/clang/test/Preprocessor/macro_arg_directive.c
index c612aa545a2a9..929a03d70d025 100644
--- a/clang/test/Preprocessor/macro_arg_directive.c
+++ b/clang/test/Preprocessor/macro_arg_directive.c
@@ -18,7 +18,7 @@ void fail(const char *);
  ({ int result = 0; __VA_ARGS__; if (!result) { fail(#__VA_ARGS__); }; result })
 
 static inline int f(int k) {
-  return MUNCH( // expected-note {{to match this '('}} expected-error {{returning 'void'}} expected-note {{expansion of macro 'MUNCH' requested here}}
+  return MUNCH( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{returning 'void'}} expected-note {{expansion of macro 'MUNCH' requested here}}
     if (k < 3)
       result = 24;
     else if (k > 4)
@@ -27,6 +27,6 @@ static inline int f(int k) {
 
 #include "macro_arg_directive.h" // expected-error {{embedding a #include directive within macro arguments is not supported}}
 
-int g(int k) { // expected-error {{expected ')'}}
+int g(int k) {
   return f(k) + f(k-1));
 }
diff --git a/clang/test/Preprocessor/print_line_track.c b/clang/test/Preprocessor/print_line_track.c
index 56f30073e3e86..156ae22693b85 100644
--- a/clang/test/Preprocessor/print_line_track.c
+++ b/clang/test/Preprocessor/print_line_track.c
@@ -1,9 +1,9 @@
-/* RUN: %clang_cc1 -E %s | grep -z 'a.3'
- * RUN: %clang_cc1 -E %s | grep -z 'b.16'
- * RUN: %clang_cc1 -E -P %s | grep -z 'a.3'
- * RUN: %clang_cc1 -E -P %s | grep -z 'b.16'
+/* RUN: %clang_cc1 -E %s | grep 'a 3'
+ * RUN: %clang_cc1 -E %s | grep 'b 16'
+ * RUN: %clang_cc1 -E -P %s | grep 'a 3'
+ * RUN: %clang_cc1 -E -P %s | grep 'b 16'
  * RUN: %clang_cc1 -E %s | not grep '# 0 '
- * RUN: %clang_cc1 -E -P %s | count 4
+ * RUN: %clang_cc1 -E -P %s | count 2
  * PR1848 PR3437 PR7360
  */
 
@@ -14,3 +14,4 @@
 t(a
 t(b __LINE__)
 
+

From e70e9ec3b83757761ccbba217a566d77b561ec53 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Mon, 17 Nov 2025 08:02:36 -0600
Subject: [PATCH 015/105] [flang][OpenMP] Store Block in OpenMPLoopConstruct, add access functions (#168078)

Instead of storing a variant with specific types, store parser::Block as
the body. Add two access functions to make the traversal of the nest
simpler. This will allow storing loop-nest sequences in the future.
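For illustration, the two accessors make walking a transformed loop nest a
simple chase through the first statement of each body. The sketch below is a
minimal example, not code from this patch; the helper name FindInnermostLoop
is hypothetical, and only the accessor declarations added to parse-tree.h
are assumed:

    #include "flang/Parser/parse-tree.h"

    // Illustrative only: descend through nested OpenMPLoopConstructs
    // (e.g. TILE or UNROLL applied to a nest) to the innermost DO loop.
    static const Fortran::parser::DoConstruct *FindInnermostLoop(
        const Fortran::parser::OpenMPLoopConstruct &outer) {
      const auto *construct{&outer};
      while (construct) {
        // GetNestedLoop returns the DO loop if the body starts with one.
        if (const auto *loop{construct->GetNestedLoop()}) {
          return loop;
        }
        // Otherwise the body may start with another loop-associated
        // construct; GetNestedConstruct returns it, or nullptr.
        construct = construct->GetNestedConstruct();
      }
      return nullptr;
    }

Either accessor returns nullptr when the body is empty or starts with
something else, so the walk also terminates cleanly on malformed nests.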
--- flang/include/flang/Parser/parse-tree.h | 10 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 37 +++-- flang/lib/Lower/OpenMP/Utils.cpp | 12 +- flang/lib/Parser/parse-tree.cpp | 16 +++ flang/lib/Parser/unparse.cpp | 6 - flang/lib/Semantics/canonicalize-omp.cpp | 11 +- flang/lib/Semantics/check-omp-loop.cpp | 96 +++++-------- flang/lib/Semantics/resolve-directives.cpp | 130 +++++++----------- flang/lib/Semantics/rewrite-parse-tree.cpp | 19 +-- flang/test/Parser/OpenMP/bind-clause.f90 | 2 +- .../Parser/OpenMP/declare-reduction-multi.f90 | 12 +- .../OpenMP/declare-reduction-unparse.f90 | 3 +- flang/test/Parser/OpenMP/do-tile-size.f90 | 12 +- .../loop-transformation-construct01.f90 | 58 ++++---- .../loop-transformation-construct02.f90 | 81 +++++------ .../loop-transformation-construct03.f90 | 71 +++++----- .../test/Parser/OpenMP/transparent-clause.f90 | 3 +- flang/test/Parser/OpenMP/unroll-heuristic.f90 | 33 ++--- flang/test/Semantics/OpenMP/simd-only.f90 | 8 +- 19 files changed, 283 insertions(+), 337 deletions(-) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index b1765f927d6c9..60d2ad0b764b9 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5345,12 +5345,10 @@ struct OmpEndLoopDirective : public OmpEndDirective { }; // OpenMP directives enclosing do loop -using NestedConstruct = - std::variant>; struct OpenMPLoopConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPLoopConstruct); OpenMPLoopConstruct(OmpBeginLoopDirective &&a) - : t({std::move(a), std::nullopt, std::nullopt}) {} + : t({std::move(a), Block{}, std::nullopt}) {} const OmpBeginLoopDirective &BeginDir() const { return std::get(t); @@ -5358,8 +5356,10 @@ struct OpenMPLoopConstruct { const std::optional &EndDir() const { return std::get>(t); } - std::tuple, - std::optional> + const DoConstruct *GetNestedLoop() const; + const OpenMPLoopConstruct *GetNestedConstruct() const; + + std::tuple> t; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index c3f670c62da06..f822fe3c8dd71 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3962,27 +3962,22 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::Location currentLocation = converter.genLocation(beginSpec.source); - auto &optLoopCons = - std::get>(loopConstruct.t); - if (optLoopCons.has_value()) { - if (auto *ompNestedLoopCons{ - std::get_if>( - &*optLoopCons)}) { - llvm::omp::Directive nestedDirective = - parser::omp::GetOmpDirectiveName(*ompNestedLoopCons).v; - switch (nestedDirective) { - case llvm::omp::Directive::OMPD_tile: - // Skip OMPD_tile since the tile sizes will be retrieved when - // generating the omp.loop_nest op. - break; - default: { - unsigned version = semaCtx.langOptions().OpenMPVersion; - TODO(currentLocation, - "Applying a loop-associated on the loop generated by the " + - llvm::omp::getOpenMPDirectiveName(nestedDirective, version) + - " construct"); - } - } + if (const parser::OpenMPLoopConstruct *ompNestedLoopCons = + loopConstruct.GetNestedConstruct()) { + llvm::omp::Directive nestedDirective = + parser::omp::GetOmpDirectiveName(*ompNestedLoopCons).v; + switch (nestedDirective) { + case llvm::omp::Directive::OMPD_tile: + // Skip OMPD_tile since the tile sizes will be retrieved when + // generating the omp.loop_nest op. 
+ break; + default: { + unsigned version = semaCtx.langOptions().OpenMPVersion; + TODO(currentLocation, + "Applying a loop-associated on the loop generated by the " + + llvm::omp::getOpenMPDirectiveName(nestedDirective, version) + + " construct"); + } } } diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index eda4d0782f486..7d7a4869ab3a6 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -779,17 +779,9 @@ static void processTileSizesFromOpenMPConstruct( if (!ompCons) return; if (auto *ompLoop{std::get_if(&ompCons->u)}) { - const auto &nestedOptional = - std::get>(ompLoop->t); - assert(nestedOptional.has_value() && - "Expected a DoConstruct or OpenMPLoopConstruct"); - const auto *innerConstruct = - std::get_if>( - &(nestedOptional.value())); - if (innerConstruct) { - const auto &innerLoopDirective = innerConstruct->value(); + if (auto *innerConstruct = ompLoop->GetNestedConstruct()) { const parser::OmpDirectiveSpecification &innerBeginSpec = - innerLoopDirective.BeginDir(); + innerConstruct->BeginDir(); if (innerBeginSpec.DirId() == llvm::omp::Directive::OMPD_tile) { // Get the size values from parse tree and convert to a vector. for (const auto &clause : innerBeginSpec.Clauses().v) { diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index ad0016e1404f9..60e51895cdcea 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "flang/Parser/parse-tree.h" + #include "flang/Common/idioms.h" #include "flang/Common/indirection.h" +#include "flang/Parser/openmp-utils.h" #include "flang/Parser/tools.h" #include "flang/Parser/user-state.h" #include "llvm/ADT/ArrayRef.h" @@ -432,6 +434,20 @@ const OmpClauseList &OmpDirectiveSpecification::Clauses() const { return empty; } +const DoConstruct *OpenMPLoopConstruct::GetNestedLoop() const { + if (auto &body{std::get(t)}; !body.empty()) { + return Unwrap(body.front()); + } + return nullptr; +} + +const OpenMPLoopConstruct *OpenMPLoopConstruct::GetNestedConstruct() const { + if (auto &body{std::get(t)}; !body.empty()) { + return Unwrap(body.front()); + } + return nullptr; +} + static bool InitCharBlocksFromStrings(llvm::MutableArrayRef blocks, llvm::ArrayRef strings) { for (auto [i, n] : llvm::enumerate(strings)) { diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index e3bc3cdc42ffb..f81200d092b11 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2706,12 +2706,6 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } - void Unparse(const OpenMPLoopConstruct &x) { - Walk(std::get(x.t)); - Walk(std::get>>>(x.t)); - Walk(std::get>(x.t)); - } void Unparse(const BasedPointer &x) { Put('('), Walk(std::get<0>(x.t)), Put(","), Walk(std::get<1>(x.t)); Walk("(", std::get>(x.t), ")"), Put(')'); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index a11c5250b1ab4..0cec1969e0978 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -143,6 +143,8 @@ class CanonicalizationOfOmp { parser::ToUpperCaseLetters(dirName.source.ToString())); }; + auto &body{std::get(x.t)}; + nextIt = it; while (++nextIt != block.end()) { // Ignore compiler directives. 
@@ -152,9 +154,7 @@ class CanonicalizationOfOmp { if (auto *doCons{GetConstructIf(*nextIt)}) { if (doCons->GetLoopControl()) { // move DoConstruct - std::get>>>(x.t) = - std::move(*doCons); + body.push_back(std::move(*nextIt)); nextIt = block.erase(nextIt); // try to match OmpEndLoopDirective if (nextIt != block.end()) { @@ -198,10 +198,7 @@ class CanonicalizationOfOmp { ++endIt; } RewriteOpenMPLoopConstruct(*ompLoopCons, block, nextIt); - auto &ompLoop = std::get>(x.t); - ompLoop = - std::optional{parser::NestedConstruct{ - common::Indirection{std::move(*ompLoopCons)}}}; + body.push_back(std::move(*nextIt)); nextIt = block.erase(nextIt); } else if (nestedBeginName.v == llvm::omp::Directive::OMPD_unroll && beginName.v == llvm::omp::Directive::OMPD_tile) { diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp index aaaa2d6e78280..3d3596b500880 100644 --- a/flang/lib/Semantics/check-omp-loop.cpp +++ b/flang/lib/Semantics/check-omp-loop.cpp @@ -285,13 +285,9 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { } SetLoopInfo(x); - auto &optLoopCons = std::get>(x.t); - if (optLoopCons.has_value()) { - if (const auto &doConstruct{ - std::get_if(&*optLoopCons)}) { - const auto &doBlock{std::get(doConstruct->t)}; - CheckNoBranching(doBlock, beginName.v, beginName.source); - } + if (const auto *doConstruct{x.GetNestedLoop()}) { + const auto &doBlock{std::get(doConstruct->t)}; + CheckNoBranching(doBlock, beginName.v, beginName.source); } CheckLoopItrVariableIsInt(x); CheckAssociatedLoopConstraints(x); @@ -314,46 +310,34 @@ const parser::Name OmpStructureChecker::GetLoopIndex( } void OmpStructureChecker::SetLoopInfo(const parser::OpenMPLoopConstruct &x) { - auto &optLoopCons = std::get>(x.t); - if (optLoopCons.has_value()) { - if (const auto &loopConstruct{ - std::get_if(&*optLoopCons)}) { - const parser::DoConstruct *loop{&*loopConstruct}; - if (loop && loop->IsDoNormal()) { - const parser::Name &itrVal{GetLoopIndex(loop)}; - SetLoopIv(itrVal.symbol); - } + if (const auto *loop{x.GetNestedLoop()}) { + if (loop->IsDoNormal()) { + const parser::Name &itrVal{GetLoopIndex(loop)}; + SetLoopIv(itrVal.symbol); } } } void OmpStructureChecker::CheckLoopItrVariableIsInt( const parser::OpenMPLoopConstruct &x) { - auto &optLoopCons = std::get>(x.t); - if (optLoopCons.has_value()) { - if (const auto &loopConstruct{ - std::get_if(&*optLoopCons)}) { - - for (const parser::DoConstruct *loop{&*loopConstruct}; loop;) { - if (loop->IsDoNormal()) { - const parser::Name &itrVal{GetLoopIndex(loop)}; - if (itrVal.symbol) { - const auto *type{itrVal.symbol->GetType()}; - if (!type->IsNumeric(TypeCategory::Integer)) { - context_.Say(itrVal.source, - "The DO loop iteration" - " variable must be of the type integer."_err_en_US, - itrVal.ToString()); - } - } + for (const parser::DoConstruct *loop{x.GetNestedLoop()}; loop;) { + if (loop->IsDoNormal()) { + const parser::Name &itrVal{GetLoopIndex(loop)}; + if (itrVal.symbol) { + const auto *type{itrVal.symbol->GetType()}; + if (!type->IsNumeric(TypeCategory::Integer)) { + context_.Say(itrVal.source, + "The DO loop iteration" + " variable must be of the type integer."_err_en_US, + itrVal.ToString()); } - // Get the next DoConstruct if block is not empty. - const auto &block{std::get(loop->t)}; - const auto it{block.begin()}; - loop = it != block.end() ? parser::Unwrap(*it) - : nullptr; } } + // Get the next DoConstruct if block is not empty. 
+ const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = + it != block.end() ? parser::Unwrap(*it) : nullptr; } } @@ -417,29 +401,23 @@ void OmpStructureChecker::CheckDistLinear( // Match the loop index variables with the collected symbols from linear // clauses. - auto &optLoopCons = std::get>(x.t); - if (optLoopCons.has_value()) { - if (const auto &loopConstruct{ - std::get_if(&*optLoopCons)}) { - for (const parser::DoConstruct *loop{&*loopConstruct}; loop;) { - if (loop->IsDoNormal()) { - const parser::Name &itrVal{GetLoopIndex(loop)}; - if (itrVal.symbol) { - // Remove the symbol from the collected set - indexVars.erase(&itrVal.symbol->GetUltimate()); - } - collapseVal--; - if (collapseVal == 0) { - break; - } - } - // Get the next DoConstruct if block is not empty. - const auto &block{std::get(loop->t)}; - const auto it{block.begin()}; - loop = it != block.end() ? parser::Unwrap(*it) - : nullptr; + for (const parser::DoConstruct *loop{x.GetNestedLoop()}; loop;) { + if (loop->IsDoNormal()) { + const parser::Name &itrVal{GetLoopIndex(loop)}; + if (itrVal.symbol) { + // Remove the symbol from the collected set + indexVars.erase(&itrVal.symbol->GetUltimate()); + } + collapseVal--; + if (collapseVal == 0) { + break; } } + // Get the next DoConstruct if block is not empty. + const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ? parser::Unwrap(*it) + : nullptr; } // Show error for the remaining variables diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 7b1a3ba493f5f..68d007bc2de7e 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2047,13 +2047,9 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { SetContextAssociatedLoopLevel(GetNumAffectedLoopsFromLoopConstruct(x)); if (beginName.v == llvm::omp::Directive::OMPD_do) { - auto &optLoopCons = std::get>(x.t); - if (optLoopCons.has_value()) { - if (const auto &doConstruct{ - std::get_if(&*optLoopCons)}) { - if (doConstruct->IsDoWhile()) { - return true; - } + if (const parser::DoConstruct *doConstruct{x.GetNestedLoop()}) { + if (doConstruct->IsDoWhile()) { + return true; } } } @@ -2210,18 +2206,8 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromInnerLoopContruct( const parser::OpenMPLoopConstruct &x, llvm::SmallVector &levels, llvm::SmallVector &clauses) { - - const auto &nestedOptional = - std::get>(x.t); - assert(nestedOptional.has_value() && - "Expected a DoConstruct or OpenMPLoopConstruct"); - const auto *innerConstruct = - std::get_if>( - &(nestedOptional.value())); - - if (innerConstruct) { - CollectNumAffectedLoopsFromLoopConstruct( - innerConstruct->value(), levels, clauses); + if (auto *innerConstruct{x.GetNestedConstruct()}) { + CollectNumAffectedLoopsFromLoopConstruct(*innerConstruct, levels, clauses); } } @@ -2286,24 +2272,12 @@ void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop( // Find the associated region by skipping nested loop-associated constructs // such as loop transformations - const parser::NestedConstruct *innermostAssocRegion{nullptr}; const parser::OpenMPLoopConstruct *innermostConstruct{&x}; - while (const auto &innerAssocStmt{ - std::get>( - innermostConstruct->t)}) { - innermostAssocRegion = &(innerAssocStmt.value()); - if (const auto *innerConstruct{ - std::get_if>( - innermostAssocRegion)}) { - innermostConstruct = &innerConstruct->value(); - } else { - break; - } + while (auto 
*nested{innermostConstruct->GetNestedConstruct()}) { + innermostConstruct = nested; } - if (!innermostAssocRegion) - return; - const auto &outer{std::get_if(innermostAssocRegion)}; + const auto *outer{innermostConstruct->GetNestedLoop()}; if (!outer) return; @@ -2398,61 +2372,51 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel( const parser::OmpClause *clause{GetAssociatedClause()}; bool hasCollapseClause{ clause ? (clause->Id() == llvm::omp::OMPC_collapse) : false}; - const parser::OpenMPLoopConstruct *innerMostLoop = &x; - const parser::NestedConstruct *innerMostNest = nullptr; - while (auto &optLoopCons{ - std::get>(innerMostLoop->t)}) { - innerMostNest = &(optLoopCons.value()); - if (const auto *innerLoop{ - std::get_if>( - innerMostNest)}) { - innerMostLoop = &(innerLoop->value()); - } else - break; - } - if (innerMostNest) { - if (const auto &outer{std::get_if(innerMostNest)}) { - for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; - --level) { - if (loop->IsDoConcurrent()) { - // DO CONCURRENT is explicitly allowed for the LOOP construct so long - // as there isn't a COLLAPSE clause - if (isLoopConstruct) { - if (hasCollapseClause) { - // hasCollapseClause implies clause != nullptr - context_.Say(clause->source, - "DO CONCURRENT loops cannot be used with the COLLAPSE clause."_err_en_US); - } - } else { - auto &stmt = - std::get>(loop->t); - context_.Say(stmt.source, - "DO CONCURRENT loops cannot form part of a loop nest."_err_en_US); + const parser::OpenMPLoopConstruct *innerMostNest = &x; + while (auto *nested{innerMostNest->GetNestedConstruct()}) { + innerMostNest = nested; + } + + if (const auto *outer{innerMostNest->GetNestedLoop()}) { + for (const parser::DoConstruct *loop{&*outer}; loop && level > 0; --level) { + if (loop->IsDoConcurrent()) { + // DO CONCURRENT is explicitly allowed for the LOOP construct so long + // as there isn't a COLLAPSE clause + if (isLoopConstruct) { + if (hasCollapseClause) { + // hasCollapseClause implies clause != nullptr + context_.Say(clause->source, + "DO CONCURRENT loops cannot be used with the COLLAPSE clause."_err_en_US); } + } else { + auto &stmt = + std::get>(loop->t); + context_.Say(stmt.source, + "DO CONCURRENT loops cannot form part of a loop nest."_err_en_US); } - // go through all the nested do-loops and resolve index variables - const parser::Name *iv{GetLoopIndex(*loop)}; - if (iv) { - if (auto *symbol{ResolveOmp(*iv, ivDSA, currScope())}) { - SetSymbolDSA(*symbol, {Symbol::Flag::OmpPreDetermined, ivDSA}); - iv->symbol = symbol; // adjust the symbol within region - AddToContextObjectWithDSA(*symbol, ivDSA); - } - - const auto &block{std::get(loop->t)}; - const auto it{block.begin()}; - loop = it != block.end() ? GetDoConstructIf(*it) : nullptr; + } + // go through all the nested do-loops and resolve index variables + const parser::Name *iv{GetLoopIndex(*loop)}; + if (iv) { + if (auto *symbol{ResolveOmp(*iv, ivDSA, currScope())}) { + SetSymbolDSA(*symbol, {Symbol::Flag::OmpPreDetermined, ivDSA}); + iv->symbol = symbol; // adjust the symbol within region + AddToContextObjectWithDSA(*symbol, ivDSA); } + + const auto &block{std::get(loop->t)}; + const auto it{block.begin()}; + loop = it != block.end() ? 
GetDoConstructIf(*it) : nullptr; } - CheckAssocLoopLevel(level, GetAssociatedClause()); - } else { - context_.Say(GetContext().directiveSource, - "A DO loop must follow the %s directive"_err_en_US, - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) - .str())); } + CheckAssocLoopLevel(level, GetAssociatedClause()); + } else { + context_.Say(GetContext().directiveSource, + "A DO loop must follow the %s directive"_err_en_US, + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) + .str())); } } diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 5b7dab309eda7..b5a07680a3377 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -195,18 +195,16 @@ void RewriteMutator::OpenMPSimdOnly( ++it; continue; } - auto &nest = - std::get>(ompLoop->t); - if (auto *doConstruct = - std::get_if(&nest.value())) { + const_cast(ompLoop->GetNestedLoop())) { auto &loopBody = std::get(doConstruct->t); // We can only remove some constructs from a loop when it's _not_ a // OpenMP simd loop - OpenMPSimdOnly(loopBody, /*isNonSimdLoopBody=*/true); - auto newDoConstruct = std::move(*doConstruct); + OpenMPSimdOnly(const_cast(loopBody), + /*isNonSimdLoopBody=*/true); + auto newLoop = parser::ExecutionPartConstruct{ - parser::ExecutableConstruct{std::move(newDoConstruct)}}; + parser::ExecutableConstruct{std::move(*doConstruct)}}; it = block.erase(it); block.insert(it, std::move(newLoop)); continue; @@ -386,11 +384,8 @@ bool RewriteMutator::Pre(parser::OpenMPLoopConstruct &ompLoop) { // If we're looking at a non-simd OpenMP loop, we need to explicitly // call OpenMPSimdOnly on the nested loop block while indicating where // the block comes from. 
- auto &nest = std::get>(ompLoop.t); - if (!nest.has_value()) { - return true; - } - if (auto *doConstruct = std::get_if(&*nest)) { + if (auto *doConstruct = + const_cast(ompLoop.GetNestedLoop())) { auto &innerBlock = std::get(doConstruct->t); OpenMPSimdOnly(innerBlock, /*isNonSimdLoopBody=*/true); } diff --git a/flang/test/Parser/OpenMP/bind-clause.f90 b/flang/test/Parser/OpenMP/bind-clause.f90 index a4fb3aa66c1c8..6910ffbba204f 100644 --- a/flang/test/Parser/OpenMP/bind-clause.f90 +++ b/flang/test/Parser/OpenMP/bind-clause.f90 @@ -22,5 +22,5 @@ subroutine f00 !PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = loop !PARSE-TREE: | | OmpClauseList -> OmpClause -> Bind -> OmpBindClause -> Binding = Parallel !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct diff --git a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 index 88566613bd412..f8104254aa6b1 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-multi.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-multi.f90 @@ -198,7 +198,8 @@ program omp_examples !PARSE-TREE: | | | Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Add !PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'sum' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct do i = 1, n sum%r = sum%r + values(i)%r @@ -215,7 +216,8 @@ program omp_examples !PARSE-TREE: | | | Modifier -> OmpReductionIdentifier -> DefinedOperator -> IntrinsicOperator = Multiply !PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'prod' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct do i = 1, n prod%r = prod%r * (values(i)%r+0.6) @@ -232,7 +234,8 @@ program omp_examples !PARSE-TREE: | | | Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'max' !PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'big' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct do i = 1, n big = mymax(values(i), big) @@ -249,7 +252,8 @@ program omp_examples !PARSE-TREE: | | | Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'min' !PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'small' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct do i = 1, n small%r = min(values(i)%r, small%r) diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 index 7897eb0fb46f0..31431f5d20c45 100644 --- a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -70,7 +70,8 @@ end subroutine initme !PARSE-TREE: | | | Modifier -> OmpReductionIdentifier -> ProcedureDesignator -> Name = 'red_add' !PARSE-TREE: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'res' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct do i=1,n res=res+x(i) 
diff --git a/flang/test/Parser/OpenMP/do-tile-size.f90 b/flang/test/Parser/OpenMP/do-tile-size.f90 index 9ba6a3a6c2c41..b8d175c236bf9 100644 --- a/flang/test/Parser/OpenMP/do-tile-size.f90 +++ b/flang/test/Parser/OpenMP/do-tile-size.f90 @@ -21,9 +21,11 @@ subroutine openmp_do_tiles(x) !PARSE-TREE:| | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !PARSE-TREE:| | | OmpBeginLoopDirective -!PARSE-TREE:| | | OpenMPLoopConstruct -!PARSE-TREE:| | | | OmpBeginLoopDirective -!PARSE-TREE:| | | | | OmpDirectiveName -> llvm::omp::Directive = tile -!PARSE-TREE:| | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' -!PARSE-TREE: | | | | DoConstruct +!PARSE-TREE:| | | Block +!PARSE-TREE:| | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct +!PARSE-TREE:| | | | | OmpBeginLoopDirective +!PARSE-TREE:| | | | | | OmpDirectiveName -> llvm::omp::Directive = tile +!PARSE-TREE:| | | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' +!PARSE-TREE:| | | | | Block +!PARSE-TREE:| | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct END subroutine openmp_do_tiles diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 index 9595889b1bf98..8b314d8d823db 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct01.f90 @@ -24,40 +24,42 @@ subroutine loop_transformation_construct !CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> !CHECK-PARSE-NEXT: | | | | Flags = None -!CHECK-PARSE-NEXT: | | | OpenMPLoopConstruct -!CHECK-PARSE-NEXT: | | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll -!CHECK-PARSE-NEXT: | | | | | OmpClauseList -> -!CHECK-PARSE-NEXT: | | | | | Flags = None -!CHECK-PARSE-NEXT: | | | | DoConstruct -!CHECK-PARSE-NEXT: | | | | | NonLabelDoStmt -!CHECK-PARSE-NEXT: | | | | | | LoopControl -> LoopBounds -!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Expr = '1_4' -!CHECK-PARSE-NEXT: | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Expr = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | Designator -> DataRef -> Name = 'i' +!CHECK-PARSE-NEXT: | | | Block +!CHECK-PARSE-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct +!CHECK-PARSE-NEXT: | | | | | OmpBeginLoopDirective +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | | Flags = None !CHECK-PARSE-NEXT: | | | | | Block -!CHECK-PARSE-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y(int(i,kind=8))=5_4*y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | Variable = 'y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | Designator -> DataRef -> ArrayElement -!CHECK-PARSE-NEXT: | | | | | | | | | DataRef -> Name = 'y' -!CHECK-PARSE-NEXT: | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' +!CHECK-PARSE-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct +!CHECK-PARSE-NEXT: | | | | | | | NonLabelDoStmt +!CHECK-PARSE-NEXT: | | | | | | | | LoopControl -> LoopBounds +!CHECK-PARSE-NEXT: | | | | | | | | | 
Scalar -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | Scalar -> Expr = '1_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-PARSE-NEXT: | | | | | | | | | Scalar -> Expr = 'i' !CHECK-PARSE-NEXT: | | | | | | | | | | Designator -> DataRef -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | Expr = '5_4*y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | Multiply -!CHECK-PARSE-NEXT: | | | | | | | | | Expr = 'y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y(int(i,kind=8))=5_4*y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | Variable = 'y(int(i,kind=8))' !CHECK-PARSE-NEXT: | | | | | | | | | | Designator -> DataRef -> ArrayElement !CHECK-PARSE-NEXT: | | | | | | | | | | | DataRef -> Name = 'y' !CHECK-PARSE-NEXT: | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' !CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | | Expr = '5_4' -!CHECK-PARSE-NEXT: | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' -!CHECK-PARSE-NEXT: | | | | | EndDoStmt -> -!CHECK-PARSE-NEXT: | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll -!CHECK-PARSE-NEXT: | | | | | OmpClauseList -> -!CHECK-PARSE-NEXT: | | | | | Flags = None +!CHECK-PARSE-NEXT: | | | | | | | | | Expr = '5_4*y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | | Multiply +!CHECK-PARSE-NEXT: | | | | | | | | | | | Expr = 'y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> ArrayElement +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | DataRef -> Name = 'y' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | Expr = '5_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' +!CHECK-PARSE-NEXT: | | | | | | | EndDoStmt -> +!CHECK-PARSE-NEXT: | | | | | OmpEndLoopDirective +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | | Flags = None !CHECK-PARSE-NEXT: | | | OmpEndLoopDirective !CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 index a876c77a274b5..5b5b591b35f8f 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct02.f90 @@ -26,50 +26,53 @@ subroutine loop_transformation_construct !CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> !CHECK-PARSE-NEXT: | | | | Flags = None -!CHECK-PARSE-NEXT: | | | OpenMPLoopConstruct -!CHECK-PARSE-NEXT: | | | | OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll -!CHECK-PARSE-NEXT: | | | | | OmpClauseList -> -!CHECK-PARSE-NEXT: | | | | | Flags = None -!CHECK-PARSE-NEXT: | | | | OpenMPLoopConstruct +!CHECK-PARSE-NEXT: | | | Block +!CHECK-PARSE-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct !CHECK-PARSE-NEXT: | | | | | 
OmpBeginLoopDirective -!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile -!CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' -!CHECK-PARSE-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '2' +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll +!CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> !CHECK-PARSE-NEXT: | | | | | | Flags = None -!CHECK-PARSE-NEXT: | | | | | DoConstruct -!CHECK-PARSE-NEXT: | | | | | | NonLabelDoStmt -!CHECK-PARSE-NEXT: | | | | | | | LoopControl -> LoopBounds -!CHECK-PARSE-NEXT: | | | | | | | | Scalar -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | Scalar -> Expr = '1_4' -!CHECK-PARSE-NEXT: | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-PARSE-NEXT: | | | | | | | | Scalar -> Expr = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | Block -!CHECK-PARSE-NEXT: | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y(int(i,kind=8))=5_4*y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | Variable = 'y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | | Designator -> DataRef -> ArrayElement -!CHECK-PARSE-NEXT: | | | | | | | | | | DataRef -> Name = 'y' -!CHECK-PARSE-NEXT: | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | | | | Designator -> DataRef -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | Expr = '5_4*y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | | Multiply -!CHECK-PARSE-NEXT: | | | | | | | | | | Expr = 'y(int(i,kind=8))' -!CHECK-PARSE-NEXT: | | | | | | | | | | | Designator -> DataRef -> ArrayElement -!CHECK-PARSE-NEXT: | | | | | | | | | | | | DataRef -> Name = 'y' -!CHECK-PARSE-NEXT: | | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' -!CHECK-PARSE-NEXT: | | | | | | | | | | Expr = '5_4' -!CHECK-PARSE-NEXT: | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' -!CHECK-PARSE-NEXT: | | | | | | EndDoStmt -> +!CHECK-PARSE-NEXT: | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct +!CHECK-PARSE-NEXT: | | | | | | | OmpBeginLoopDirective +!CHECK-PARSE-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile +!CHECK-PARSE-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Sizes -> Scalar -> Integer -> Expr = '2_4' +!CHECK-PARSE-NEXT: | | | | | | | | | LiteralConstant -> IntLiteralConstant = '2' +!CHECK-PARSE-NEXT: | | | | | | | | Flags = None +!CHECK-PARSE-NEXT: | | | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct +!CHECK-PARSE-NEXT: | | | | | | | | | NonLabelDoStmt +!CHECK-PARSE-NEXT: | | | | | | | | | | LoopControl -> LoopBounds +!CHECK-PARSE-NEXT: | | | | | | | | | | | Scalar -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | Scalar -> Expr = '1_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-PARSE-NEXT: | | | | | | | | | | | Scalar -> Expr = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'y(int(i,kind=8))=5_4*y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | 
| | | | | | | Variable = 'y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> ArrayElement +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | DataRef -> Name = 'y' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | Expr = '5_4*y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Multiply +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | Expr = 'y(int(i,kind=8))' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | Designator -> DataRef -> ArrayElement +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | | DataRef -> Name = 'y' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | | | Designator -> DataRef -> Name = 'i' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | Expr = '5_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '5' +!CHECK-PARSE-NEXT: | | | | | | | | | EndDoStmt -> +!CHECK-PARSE-NEXT: | | | | | | | OmpEndLoopDirective +!CHECK-PARSE-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile +!CHECK-PARSE-NEXT: | | | | | | | | OmpClauseList -> +!CHECK-PARSE-NEXT: | | | | | | | | Flags = None !CHECK-PARSE-NEXT: | | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = tile +!CHECK-PARSE-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll !CHECK-PARSE-NEXT: | | | | | | OmpClauseList -> !CHECK-PARSE-NEXT: | | | | | | Flags = None -!CHECK-PARSE-NEXT: | | | | OmpEndLoopDirective -!CHECK-PARSE-NEXT: | | | | | OmpDirectiveName -> llvm::omp::Directive = unroll -!CHECK-PARSE-NEXT: | | | | | OmpClauseList -> -!CHECK-PARSE-NEXT: | | | | | Flags = None !CHECK-PARSE-NEXT: | | | OmpEndLoopDirective !CHECK-PARSE-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = do !CHECK-PARSE-NEXT: | | | | OmpClauseList -> diff --git a/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 b/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 index 8725025a51321..e431b6d535ff5 100644 --- a/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 +++ b/flang/test/Parser/OpenMP/loop-transformation-construct03.f90 @@ -26,41 +26,42 @@ subroutine loop_transformation_construct7 !CHECK-PARSE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '2' !CHECK-PARSE-NEXT: | | | | OmpClause -> Private -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'b' !CHECK-PARSE-NEXT: | | | | Flags = None -!CHECK-PARSE-NEXT: | | | DoConstruct -!CHECK-PARSE-NEXT: | | | | NonLabelDoStmt -!CHECK-PARSE-NEXT: | | | | | LoopControl -> LoopBounds -!CHECK-PARSE-NEXT: | | | | | | Scalar -> Name = 'b' -!CHECK-PARSE-NEXT: | | | | | | Scalar -> Expr = '1_4' -!CHECK-PARSE-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-PARSE-NEXT: | | | | | | Scalar -> Expr = '10_4' -!CHECK-PARSE-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '10' -!CHECK-PARSE-NEXT: | | | | Block -!CHECK-PARSE-NEXT: | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct -!CHECK-PARSE-NEXT: | | | | | | NonLabelDoStmt -!CHECK-PARSE-NEXT: | | | | | | | LoopControl -> LoopBounds -!CHECK-PARSE-NEXT: | | | | | | | | Scalar -> Name = 'c' -!CHECK-PARSE-NEXT: | | | | | | | | Scalar -> Expr = '1_4' -!CHECK-PARSE-NEXT: | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-PARSE-NEXT: | | | | | | | | 
Scalar -> Expr = '10_4' -!CHECK-PARSE-NEXT: | | | | | | | | | LiteralConstant -> IntLiteralConstant = '10' -!CHECK-PARSE-NEXT: | | | | | | Block -!CHECK-PARSE-NEXT: | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'a(int(b,kind=8),2_8)=a(int(c,kind=8),1_8)' -!CHECK-PARSE-NEXT: | | | | | | | | Variable = 'a(int(b,kind=8),2_8)' -!CHECK-PARSE-NEXT: | | | | | | | | | Designator -> DataRef -> ArrayElement -!CHECK-PARSE-NEXT: | | | | | | | | | | DataRef -> Name = 'a' -!CHECK-PARSE-NEXT: | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'b' -!CHECK-PARSE-NEXT: | | | | | | | | | | | Designator -> DataRef -> Name = 'b' -!CHECK-PARSE-NEXT: | | | | | | | | | | SectionSubscript -> Integer -> Expr = '2_4' -!CHECK-PARSE-NEXT: | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-PARSE-NEXT: | | | | | | | | Expr = 'a(int(c,kind=8),1_8)' -!CHECK-PARSE-NEXT: | | | | | | | | | Designator -> DataRef -> ArrayElement -!CHECK-PARSE-NEXT: | | | | | | | | | | DataRef -> Name = 'a' -!CHECK-PARSE-NEXT: | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'c' -!CHECK-PARSE-NEXT: | | | | | | | | | | | Designator -> DataRef -> Name = 'c' -!CHECK-PARSE-NEXT: | | | | | | | | | | SectionSubscript -> Integer -> Expr = '1_4' -!CHECK-PARSE-NEXT: | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' -!CHECK-PARSE-NEXT: | | | | | | EndDoStmt -> -!CHECK-PARSE-NEXT: | | | | EndDoStmt -> +!CHECK-PARSE-NEXT: | | | Block +!CHECK-PARSE-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct +!CHECK-PARSE-NEXT: | | | | | NonLabelDoStmt +!CHECK-PARSE-NEXT: | | | | | | LoopControl -> LoopBounds +!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Name = 'b' +!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Expr = '1_4' +!CHECK-PARSE-NEXT: | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-PARSE-NEXT: | | | | | | | Scalar -> Expr = '10_4' +!CHECK-PARSE-NEXT: | | | | | | | | LiteralConstant -> IntLiteralConstant = '10' +!CHECK-PARSE-NEXT: | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct +!CHECK-PARSE-NEXT: | | | | | | | NonLabelDoStmt +!CHECK-PARSE-NEXT: | | | | | | | | LoopControl -> LoopBounds +!CHECK-PARSE-NEXT: | | | | | | | | | Scalar -> Name = 'c' +!CHECK-PARSE-NEXT: | | | | | | | | | Scalar -> Expr = '1_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-PARSE-NEXT: | | | | | | | | | Scalar -> Expr = '10_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '10' +!CHECK-PARSE-NEXT: | | | | | | | Block +!CHECK-PARSE-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'a(int(b,kind=8),2_8)=a(int(c,kind=8),1_8)' +!CHECK-PARSE-NEXT: | | | | | | | | | Variable = 'a(int(b,kind=8),2_8)' +!CHECK-PARSE-NEXT: | | | | | | | | | | Designator -> DataRef -> ArrayElement +!CHECK-PARSE-NEXT: | | | | | | | | | | | DataRef -> Name = 'a' +!CHECK-PARSE-NEXT: | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'b' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> Name = 'b' +!CHECK-PARSE-NEXT: | | | | | | | | | | | SectionSubscript -> Integer -> Expr = '2_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '2' +!CHECK-PARSE-NEXT: | | | | | | | | | Expr = 'a(int(c,kind=8),1_8)' +!CHECK-PARSE-NEXT: | | | | | | | | | | Designator -> DataRef -> ArrayElement +!CHECK-PARSE-NEXT: | | | | | | | | | | | DataRef 
-> Name = 'a' +!CHECK-PARSE-NEXT: | | | | | | | | | | | SectionSubscript -> Integer -> Expr = 'c' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | Designator -> DataRef -> Name = 'c' +!CHECK-PARSE-NEXT: | | | | | | | | | | | SectionSubscript -> Integer -> Expr = '1_4' +!CHECK-PARSE-NEXT: | | | | | | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-PARSE-NEXT: | | | | | | | EndDoStmt -> +!CHECK-PARSE-NEXT: | | | | | EndDoStmt -> !CHECK-PARSE-NEXT: | EndSubroutineStmt -> !CHECK-UNPARSE: SUBROUTINE loop_transformation_construct7 diff --git a/flang/test/Parser/OpenMP/transparent-clause.f90 b/flang/test/Parser/OpenMP/transparent-clause.f90 index 8f669546f2dea..3512326b321e6 100644 --- a/flang/test/Parser/OpenMP/transparent-clause.f90 +++ b/flang/test/Parser/OpenMP/transparent-clause.f90 @@ -74,4 +74,5 @@ subroutine f02 !PARSE-TREE: | | OmpClauseList -> OmpClause -> Transparent -> OmpTransparentClause -> Scalar -> Integer -> Expr = '2_4' !PARSE-TREE: | | | LiteralConstant -> IntLiteralConstant = '2' !PARSE-TREE: | | Flags = None -!PARSE-TREE: | DoConstruct +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct diff --git a/flang/test/Parser/OpenMP/unroll-heuristic.f90 b/flang/test/Parser/OpenMP/unroll-heuristic.f90 index bbc2df3b57df6..c181a06b457f3 100644 --- a/flang/test/Parser/OpenMP/unroll-heuristic.f90 +++ b/flang/test/Parser/OpenMP/unroll-heuristic.f90 @@ -23,22 +23,23 @@ END subroutine openmp_parse_unroll_heuristic !PTREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = unroll !PTREE-NEXT: | | OmpClauseList -> !PTREE-NEXT: | | Flags = None -!PTREE-NEXT: | DoConstruct -!PTREE-NEXT: | | NonLabelDoStmt -!PTREE-NEXT: | | | LoopControl -> LoopBounds -!PTREE-NEXT: | | | | Scalar -> Name = 'i' -!PTREE-NEXT: | | | | Scalar -> Expr = '1_4' -!PTREE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '1' -!PTREE-NEXT: | | | | Scalar -> Expr = '100_4' -!PTREE-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '100' -!PTREE-NEXT: | | Block -!PTREE-NEXT: | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> CallStmt = 'CALL func(i)' -!PTREE-NEXT: | | | | | | Call -!PTREE-NEXT: | | | | | ProcedureDesignator -> Name = 'func' -!PTREE-NEXT: | | | | | ActualArgSpec -!PTREE-NEXT: | | | | | | ActualArg -> Expr = 'i' -!PTREE-NEXT: | | | | | | | Designator -> DataRef -> Name = 'i' -!PTREE-NEXT: | | EndDoStmt -> +!PTREE-NEXT: | Block +!PTREE-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct +!PTREE-NEXT: | | | NonLabelDoStmt +!PTREE-NEXT: | | | | LoopControl -> LoopBounds +!PTREE-NEXT: | | | | | Scalar -> Name = 'i' +!PTREE-NEXT: | | | | | Scalar -> Expr = '1_4' +!PTREE-NEXT: | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!PTREE-NEXT: | | | | | Scalar -> Expr = '100_4' +!PTREE-NEXT: | | | | | | LiteralConstant -> IntLiteralConstant = '100' +!PTREE-NEXT: | | | Block +!PTREE-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> CallStmt = 'CALL func(i)' +!PTREE-NEXT: | | | | | | | Call +!PTREE-NEXT: | | | | | | ProcedureDesignator -> Name = 'func' +!PTREE-NEXT: | | | | | | ActualArgSpec +!PTREE-NEXT: | | | | | | | ActualArg -> Expr = 'i' +!PTREE-NEXT: | | | | | | | | Designator -> DataRef -> Name = 'i' +!PTREE-NEXT: | | | EndDoStmt -> !PTREE-NEXT: | OmpEndLoopDirective !PTREE-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = unroll !PTREE-NEXT: | | OmpClauseList -> diff --git a/flang/test/Semantics/OpenMP/simd-only.f90 b/flang/test/Semantics/OpenMP/simd-only.f90 index 
e137ef7d82929..4e29329e15cac 100644 --- a/flang/test/Semantics/OpenMP/simd-only.f90 +++ b/flang/test/Semantics/OpenMP/simd-only.f90 @@ -10,7 +10,7 @@ subroutine test_simd() ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! CHECK: OmpDirectiveName -> llvm::omp::Directive = simd - ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp simd do i = 1, 100 end do @@ -22,7 +22,7 @@ subroutine test_do_simd() ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! CHECK: OmpDirectiveName -> llvm::omp::Directive = do simd - ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp do simd do i = 1, 100 end do @@ -35,7 +35,7 @@ subroutine test_parallel_do_simd() ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! CHECK: OmpDirectiveName -> llvm::omp::Directive = parallel do simd - ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp parallel do simd do i = 1, 100 end do @@ -65,7 +65,7 @@ subroutine test_simd_atomic() ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct ! CHECK: OmpDirectiveName -> llvm::omp::Directive = simd - ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct + ! CHECK: ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct !$omp simd do i = 1, 100 ! CHECK-NOT: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct From 29e7b4f9a72576a2901407834b988ec37f931d28 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Mon, 17 Nov 2025 07:35:27 -0700 Subject: [PATCH 016/105] [flang][NFC] Strip trailing whitespace from tests (5 of N) Only the fortran source files in flang/test/Lower/OpenACC have been modified. 
The other files in flang/test will be cleaned up in subsequent commits --- .../OpenACC/Todo/do-loops-to-acc-loops-todo.f90 | 6 +++--- flang/test/Lower/OpenACC/acc-atomic-capture.f90 | 10 +++++----- .../test/Lower/OpenACC/acc-atomic-update-array.f90 | 6 +++--- flang/test/Lower/OpenACC/acc-atomic-update.f90 | 6 +++--- flang/test/Lower/OpenACC/acc-bounds.f90 | 2 +- flang/test/Lower/OpenACC/acc-host-data.f90 | 2 +- flang/test/Lower/OpenACC/acc-loop-exit.f90 | 8 ++++---- flang/test/Lower/OpenACC/acc-loop.f90 | 4 ++-- flang/test/Lower/OpenACC/acc-private.f90 | 8 ++++---- flang/test/Lower/OpenACC/acc-routine-named.f90 | 2 +- flang/test/Lower/OpenACC/acc-routine.f90 | 6 +++--- flang/test/Lower/OpenACC/acc-routine02.f90 | 2 +- flang/test/Lower/OpenACC/acc-routine03.f90 | 2 +- flang/test/Lower/OpenACC/acc-routine04.f90 | 8 ++++---- flang/test/Lower/OpenACC/acc-shutdown.f90 | 4 ++-- flang/test/Lower/OpenACC/acc-terminator.f90 | 2 +- flang/test/Lower/OpenACC/acc-use-device.f90 | 4 ++-- flang/test/Lower/OpenACC/locations.f90 | 14 ++++++-------- 18 files changed, 47 insertions(+), 49 deletions(-) diff --git a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 index aa1d44365e5eb..3f2b77a9a1484 100644 --- a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 +++ b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 @@ -72,9 +72,9 @@ subroutine nested_loop_with_inner_goto() integer :: ii = 0, jj = 0 integer, parameter :: nn = 3 real, dimension(nn, nn) :: aa - + aa = -1 - + ! Nested loop with goto from inner loop - unstructured control flow is not converted. !$acc kernels do ii = 1, nn @@ -88,4 +88,4 @@ subroutine nested_loop_with_inner_goto() ! CHECK4: not yet implemented: unstructured do loop in acc kernels -end subroutine \ No newline at end of file +end subroutine diff --git a/flang/test/Lower/OpenACC/acc-atomic-capture.f90 b/flang/test/Lower/OpenACC/acc-atomic-capture.f90 index 30e60e34b13a2..ccdd4d3014e94 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-capture.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-capture.f90 @@ -36,7 +36,7 @@ program acc_atomic_capture_test !CHECK: } !$acc atomic capture - y = x * y + y = x * y x = y !$acc end atomic @@ -53,8 +53,8 @@ program acc_atomic_capture_test !$acc atomic capture x = y - y = 2 * 10 + (8 - x) - !$acc end atomic + y = 2 * 10 + (8 - x) + !$acc end atomic end program @@ -123,8 +123,8 @@ subroutine capture_with_convert_f32_to_i32() ! CHECK: } subroutine capture_with_convert_i32_to_f64() - real(8) :: x - integer :: v, u + real(8) :: x + integer :: v, u x = 1.0 v = 0 u = 1 diff --git a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 index 184c2a6fb0aeb..b04b0c9d5a143 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-update-array.f90 @@ -5,7 +5,7 @@ subroutine atomic_update_array1(r, n, x) integer :: n real :: r(n), x integer :: i - + !$acc data copy(r) !$acc parallel loop @@ -51,7 +51,7 @@ subroutine atomic_write_array1(r, n, x) implicit none integer :: n real :: r(n), x - + !$acc atomic write x = r(n) end subroutine @@ -61,7 +61,7 @@ subroutine atomic_write_array1(r, n, x) ! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Ex"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[DECL_R:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFatomic_write_array1Er"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) ! CHECK: %[[DES:.*]] = hlfir.designate %[[DECL_R]]#0 (%{{.*}}) : (!fir.box>, i64) -> !fir.ref -! CHECK: %[[LOAD:.*]] = fir.load %[[DES]] : !fir.ref +! CHECK: %[[LOAD:.*]] = fir.load %[[DES]] : !fir.ref ! CHECK: acc.atomic.write %[[DECL_X]]#0 = %[[LOAD]] : !fir.ref, f32 subroutine atomic_capture_array1(r, n, x, y) diff --git a/flang/test/Lower/OpenACC/acc-atomic-update.f90 b/flang/test/Lower/OpenACC/acc-atomic-update.f90 index 71aa69fd64eba..f4c305a332640 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-update.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-update.f90 @@ -42,7 +42,7 @@ end function func !CHECK: } !$acc atomic update - a = a + b + a = a + b !CHECK: {{.*}} = arith.constant 1 : i32 !CHECK: acc.atomic.update %[[Y_DECL]]#0 : !fir.ref { @@ -56,10 +56,10 @@ end function func !CHECK: %[[RESULT:.*]] = arith.muli %[[LOADED_X]], %[[ARG]] : i32 !CHECK: acc.yield %[[RESULT]] : i32 !CHECK: } - !$acc atomic + !$acc atomic y = y + 1 !$acc atomic update - z = x * z + z = x * z !CHECK: %[[C1_VAL:.*]] = arith.constant 1 : i32 !CHECK: acc.atomic.update %[[I1_DECL]]#0 : !fir.ref { diff --git a/flang/test/Lower/OpenACC/acc-bounds.f90 b/flang/test/Lower/OpenACC/acc-bounds.f90 index 44ca2514f6eea..03779ac0cfe51 100644 --- a/flang/test/Lower/OpenACC/acc-bounds.f90 +++ b/flang/test/Lower/OpenACC/acc-bounds.f90 @@ -114,7 +114,7 @@ subroutine acc_optional_data(a) !$acc data attach(a) !$acc end data end subroutine - + ! CHECK-LABEL: func.func @_QMopenacc_boundsPacc_optional_data( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref>>> {fir.bindc_name = "a", fir.optional}) { ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QMopenacc_boundsFacc_optional_dataEa"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90 index 4d09b25b983b9..2cf8060bcf8d1 100644 --- a/flang/test/Lower/OpenACC/acc-host-data.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data.f90 @@ -26,7 +26,7 @@ subroutine acc_host_data() ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) ! CHECK: } attributes {ifPresent} - !$acc host_data use_device(a) if_present + !$acc host_data use_device(a) if_present !$acc end host_data ! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90 index 0b35a86c41b2e..6ab215fdbd842 100644 --- a/flang/test/Lower/OpenACC/acc-loop-exit.f90 +++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90 @@ -11,7 +11,7 @@ subroutine sub1(x, a) end do i = 2 -end +end ! CHECK-LABEL: func.func @_QPsub1 ! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 dummy_scope %{{[0-9]+}} arg {{[0-9]+}} {uniq_name = "_QFsub1Ea"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) @@ -20,9 +20,9 @@ subroutine sub1(x, a) ! CHECK: %[[I:.*]]:2 = hlfir.declare %{{[0-9]+}} {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: ^bb{{.*}}: ! CHECK: ^bb{{.*}}: -! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref -! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref -! CHECK: %[[LOAD_A:.*]] = fir.load %[[A]]#0 : !fir.ref +! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref +! 
CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref +! CHECK: %[[LOAD_A:.*]] = fir.load %[[A]]#0 : !fir.ref ! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[LOAD_I]], %[[LOAD_A]] : i32 ! CHECK: cf.cond_br %[[CMP]], ^[[EARLY_RET:.*]], ^[[NO_RET:.*]] ! CHECK: ^[[EARLY_RET]]: diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index f9f5e8c2165d5..b3fadbc8b388b 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -376,8 +376,8 @@ subroutine sub1(i, j, k) ! CHECK-SAME: %[[ARG_J:.*]]: !fir.ref {fir.bindc_name = "j"} ! CHECK-SAME: %[[ARG_K:.*]]: !fir.ref {fir.bindc_name = "k"} ! CHECK: %[[DC_I:.*]]:2 = hlfir.declare %[[ARG_I]] dummy_scope %0 -! CHECK: %[[DC_J:.*]]:2 = hlfir.declare %[[ARG_J]] dummy_scope %0 -! CHECK: %[[DC_K:.*]]:2 = hlfir.declare %[[ARG_K]] dummy_scope %0 +! CHECK: %[[DC_J:.*]]:2 = hlfir.declare %[[ARG_J]] dummy_scope %0 +! CHECK: %[[DC_K:.*]]:2 = hlfir.declare %[[ARG_K]] dummy_scope %0 ! CHECK: acc.parallel combined(loop) ! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} ! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "j"} diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90 index 10d103c84f8de..ea12da7a5a99f 100644 --- a/flang/test/Lower/OpenACC/acc-private.f90 +++ b/flang/test/Lower/OpenACC/acc-private.f90 @@ -195,8 +195,8 @@ ! CHECK-LABEL: acc.firstprivate.recipe @firstprivatization_ref_i32 : !fir.ref init { ! CHECK: ^bb0(%{{.*}}: !fir.ref): ! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.private.init"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref +! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.private.init"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref ! CHECK: } copy { ! CHECK: ^bb0(%[[SRC:.*]]: !fir.ref, %[[DST:.*]]: !fir.ref): ! CHECK: %[[VALUE:.*]] = fir.load %[[SRC]] : !fir.ref @@ -223,8 +223,8 @@ ! CHECK-LABEL: acc.private.recipe @privatization_ref_i32 : !fir.ref init { ! CHECK: ^bb0(%{{.*}}: !fir.ref): ! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 -! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.private.init"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref +! CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "acc.private.init"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: acc.yield %[[DECLARE]]#0 : !fir.ref ! CHECK: } program acc_private diff --git a/flang/test/Lower/OpenACC/acc-routine-named.f90 b/flang/test/Lower/OpenACC/acc-routine-named.f90 index de9784a1146cc..24d47e58b6e1b 100644 --- a/flang/test/Lower/OpenACC/acc-routine-named.f90 +++ b/flang/test/Lower/OpenACC/acc-routine-named.f90 @@ -14,7 +14,7 @@ module acc_routines subroutine acc1() end subroutine -! CHECK-LABEL: func.func @_QMacc_routinesPacc1() +! CHECK-LABEL: func.func @_QMacc_routinesPacc1() ! 
CHECK-SAME:attributes {acc.routine_info = #acc.routine_info<[@[[r1]]]>} subroutine acc2() diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 1a63b4120235c..c281ca5dfc287 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -127,13 +127,13 @@ subroutine acc_routine16() end subroutine subroutine acc_routine17() - !$acc routine device_type(host) worker dtype(multicore) vector + !$acc routine device_type(host) worker dtype(multicore) vector end subroutine subroutine acc_routine18() - !$acc routine device_type(host) bind(acc_routine17) dtype(multicore) bind(acc_routine16) + !$acc routine device_type(host) bind(acc_routine17) dtype(multicore) bind(acc_routine16) end subroutine subroutine acc_routine19() - !$acc routine device_type(host,default) bind(acc_routine17) dtype(multicore) bind(acc_routine16) + !$acc routine device_type(host,default) bind(acc_routine17) dtype(multicore) bind(acc_routine16) end subroutine diff --git a/flang/test/Lower/OpenACC/acc-routine02.f90 b/flang/test/Lower/OpenACC/acc-routine02.f90 index 1c15cb409e634..dd07cba4b20e3 100644 --- a/flang/test/Lower/OpenACC/acc-routine02.f90 +++ b/flang/test/Lower/OpenACC/acc-routine02.f90 @@ -17,4 +17,4 @@ program test ! CHECK-LABEL: acc.routine @acc_routine_0 func(@_QPsub1) -! CHECK: func.func @_QPsub1(%ar{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} +! CHECK: func.func @_QPsub1(%ar{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90 index ddd6bda0367e4..3fc307746849f 100644 --- a/flang/test/Lower/OpenACC/acc-routine03.f90 +++ b/flang/test/Lower/OpenACC/acc-routine03.f90 @@ -20,7 +20,7 @@ subroutine sub1(a) !$acc routine worker bind(sub2) real :: a(:) end subroutine - + subroutine sub2(a) !$acc routine worker nohost real :: a(:) diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index 655e2762b9694..470440728d2f5 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -14,17 +14,17 @@ subroutine sub1(i) program test_acc_routine use dummy_mod - + !$acc routine(sub2) seq - + implicit none - + integer :: i contains subroutine sub2() end subroutine - + end program ! 
CHECK: acc.routine @acc_routine_1 func(@_QFPsub2) seq diff --git a/flang/test/Lower/OpenACC/acc-shutdown.f90 b/flang/test/Lower/OpenACC/acc-shutdown.f90 index 304dd4fae6db5..de6191d7f0cd2 100644 --- a/flang/test/Lower/OpenACC/acc-shutdown.f90 +++ b/flang/test/Lower/OpenACC/acc-shutdown.f90 @@ -23,9 +23,9 @@ subroutine acc_shutdown !$acc shutdown device_num(1) device_type(default, nvidia) !CHECK: [[DEVNUM:%.*]] = arith.constant 1 : i32 -!CHECK: acc.shutdown device_num([[DEVNUM]] : i32) attributes {device_types = [#acc.device_type, #acc.device_type]} +!CHECK: acc.shutdown device_num([[DEVNUM]] : i32) attributes {device_types = [#acc.device_type, #acc.device_type]} !$acc shutdown device_type(default) device_type(nvidia) -!CHECK: acc.shutdown attributes {device_types = [#acc.device_type, #acc.device_type]} +!CHECK: acc.shutdown attributes {device_types = [#acc.device_type, #acc.device_type]} end subroutine acc_shutdown diff --git a/flang/test/Lower/OpenACC/acc-terminator.f90 b/flang/test/Lower/OpenACC/acc-terminator.f90 index 53ae1a5e54675..16100f9e36cd5 100644 --- a/flang/test/Lower/OpenACC/acc-terminator.f90 +++ b/flang/test/Lower/OpenACC/acc-terminator.f90 @@ -17,7 +17,7 @@ program main !$acc data copyin(a(:,:,i),b(:,:,i),c(:,:,i)) copyout(c2(:,:,i)) !$acc host_data use_device(a(:,:,i),b(:,:,i),c(:,:,i)) - + !$acc end host_data if ( stat .ne. 0 ) then diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 index 30fefdb44a2bf..4f9ed2d70b3ec 100644 --- a/flang/test/Lower/OpenACC/acc-use-device.f90 +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -9,7 +9,7 @@ subroutine test() ! CHECK: %[[A0:.*]] = fir.alloca !fir.array, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"} ! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) - + !$acc data copy(b) ! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} ! CHECK: acc.data dataOperands(%[[B]] : !fir.box>) { @@ -23,7 +23,7 @@ subroutine test() ! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath : (!fir.ref>) -> () !$acc end data ! CHECK: acc.copyout accVar(%[[B]] : !fir.box>) to var(%[[A]]#0 : !fir.box>) {dataClause = #acc, name = "b"} -end +end ! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause. subroutine test2(a, b, c) diff --git a/flang/test/Lower/OpenACC/locations.f90 b/flang/test/Lower/OpenACC/locations.f90 index 69873b3fbca4f..8e00721ba2b4d 100644 --- a/flang/test/Lower/OpenACC/locations.f90 +++ b/flang/test/Lower/OpenACC/locations.f90 @@ -114,7 +114,7 @@ subroutine if_clause_expr_location(arr) subroutine atomic_read_loc() integer(4) :: x integer(8) :: y - + !$acc atomic read y = x end @@ -123,10 +123,10 @@ subroutine atomic_read_loc() subroutine atomic_capture_loc() implicit none integer :: k, v, i - + k = 1 v = 0 - + !$acc atomic capture v = k k = (i + 1) * 3.14 @@ -142,13 +142,13 @@ subroutine atomic_capture_loc() subroutine atomic_update_loc() implicit none integer :: x, y, z - - !$acc atomic + + !$acc atomic y = y + 1 ! CHECK: acc.atomic.update %{{.*}} : !fir.ref { ! CHECK: ^bb0(%{{.*}}: i32 loc("{{.*}}locations.f90":142:3)): ! 
CHECK: } loc("{{.*}}locations.f90":142:3) - + !$acc atomic update z = x * z end subroutine @@ -183,5 +183,3 @@ subroutine data_end_locations(arr) !CHECK-SAME: loc("{{.*}}locations.f90":181:11) end subroutine end module - - From b6fd3c62bb8af8b9c79d32207d96e3674aeccb36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Mon, 17 Nov 2025 15:46:58 +0100 Subject: [PATCH 017/105] [X86] Enable APX and AVX10.2 on NVL (#168061) Per Intel Architecture Instruction Set Extensions Programming Reference rev. 60 (https://cdrdv2.intel.com/v1/dl/getContent/671368), table 1-2, NVL supports APX and AVX10.2 --- .../Preprocessor/predefined-arch-macros.c | 68 ++++++++++++++++--- llvm/lib/Target/X86/X86.td | 12 +++- llvm/lib/TargetParser/X86TargetParser.cpp | 4 +- 3 files changed, 72 insertions(+), 12 deletions(-) diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index cf2cd4a10b056..27feeb57b5de2 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -2525,15 +2525,32 @@ // RUN: %clang -march=wildcatlake -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NKL_M32 -// RUN: %clang -march=novalake -m32 -E -dM %s -o - 2>&1 \ -// RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_NKL_M32 // RUN: %clang -march=clearwaterforest -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_UMSR_M32,CHECK_NKL_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_UMSR_M32,CHECK_CWF_M32,CHECK_NKL_M32 +// RUN: %clang -march=novalake -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_CWF_M32,CHECK_NVL_M32,CHECK_NKL_M32 // CHECK_ARL_M32: #define __ADX__ 1 // CHECK_ARL_M32: #define __AES__ 1 +// CHECK_NVL_M32: #define __AVX10_1__ 1 +// CHECK_NVL_M32: #define __AVX10_2__ 1 // CHECK_ARL_M32: #define __AVX2__ 1 +// CHECK_NVL_M32: #define __AVX512BF16__ 1 +// CHECK_NVL_M32: #define __AVX512BITALG__ 1 +// CHECK_NVL_M32: #define __AVX512BW__ 1 +// CHECK_NVL_M32: #define __AVX512CD__ 1 +// CHECK_NVL_M32: #define __AVX512DQ__ 1 +// CHECK_NVL_M32: #define __AVX512FP16__ 1 +// CHECK_NVL_M32: #define __AVX512F__ 1 +// CHECK_NVL_M32: #define __AVX512IFMA__ 1 +// CHECK_NVL_M32: #define __AVX512VBMI2__ 1 +// CHECK_NVL_M32: #define __AVX512VBMI__ 1 +// CHECK_NVL_M32: #define __AVX512VL__ 1 +// CHECK_NVL_M32: #define __AVX512VNNI__ 1 +// CHECK_NVL_M32: #define __AVX512VPOPCNTDQ__ 1 +// We check for NOT AVX512 after all checks for AVX512, so +// if we missed some check on NVL, the test will fail. 
// CHECK_ARL_M32-NOT: AVX512 // CHECK_ARL_M32: #define __AVXIFMA__ 1 // CHECK_ARL_M32: #define __AVXNECONVERT__ 1 @@ -2544,11 +2561,13 @@ // CHECK_ARL_M32: #define __AVX__ 1 // CHECK_ARL_M32: #define __BMI2__ 1 // CHECK_ARL_M32: #define __BMI__ 1 +// CHECK_NVL_M32: #define __CCMP__ 1 // CHECK_ARLS_M32-NOT: __CLDEMOTE__ // CHECK_SRF_M32: #define __CLDEMOTE__ 1 // CHECK_ARL_M32: #define __CLFLUSHOPT__ 1 // CHECK_ARL_M32: #define __CLWB__ 1 // CHECK_ARL_M32: #define __CMPCCXADD__ 1 +// CHECK_NVL_M32: #define __EGPR__ 1 // CHECK_ARL_M32: #define __ENQCMD__ 1 // CHECK_ARL_M32: #define __F16C__ 1 // CHECK_ARL_M32: #define __FMA__ 1 @@ -2564,15 +2583,20 @@ // CHECK_ARL_M32: #define __MOVBE__ 1 // CHECK_ARL_M32: #define __MOVDIR64B__ 1 // CHECK_ARL_M32: #define __MOVDIRI__ 1 +// CHECK_NVL_M32: #define __MOVRS__ 1 +// CHECK_NVL_M32: #define __NDD__ 1 +// CHECK_NVL_M32: #define __NF__ 1 // CHECK_ARL_M32: #define __PCLMUL__ 1 // CHECK_ARL_M32: #define __PCONFIG__ 1 // CHECK_ARL_M32: #define __PKU__ 1 // CHECK_ARL_M32: #define __POPCNT__ 1 +// CHECK_NVL_M32: #define __PPX__ 1 // CHECK_ARL_M32-NOT: #define __PREFETCHI__ 1 // CHECK_ARLS_M32-NOT: #define __PREFETCHI__ 1 -// CHECK_NVL_M32: #define __PREFETCHI__ 1 +// CHECK_CWF_M32: #define __PREFETCHI__ 1 // CHECK_ARL_M32: #define __PRFCHW__ 1 // CHECK_ARL_M32: #define __PTWRITE__ 1 +// CHECK_NVL_M32: #define __PUSH2POP2__ 1 // CHECK_ARL_M32-NOT: #define __RAOINT__ 1 // CHECK_ARL_M32: #define __RDPID__ 1 // CHECK_ARL_M32: #define __RDRND__ 1 @@ -2607,6 +2631,7 @@ // CHECK_ARL_M32: #define __XSAVEOPT__ 1 // CHECK_ARL_M32: #define __XSAVES__ 1 // CHECK_ARL_M32: #define __XSAVE__ 1 +// CHECK_NVL_M32: #define __ZU__ 1 // CHECK_ARL_M32: #define __corei7 1 // CHECK_ARL_M32: #define __corei7__ 1 // CHECK_ARL_M32: #define __i386 1 @@ -2635,15 +2660,30 @@ // RUN: %clang -march=wildcatlake -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NKL_M64 -// RUN: %clang -march=novalake -m64 -E -dM %s -o - 2>&1 \ -// RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_NKL_M64 // RUN: %clang -march=clearwaterforest -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_UMSR_M64,CHECK_NKL_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_UMSR_M64,CHECK_CWF_M64,CHECK_NKL_M64 +// RUN: %clang -march=novalake -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_CWF_M64,CHECK_NVL_M64,CHECK_NKL_M64 // CHECK_ARL_M64: #define __ADX__ 1 // CHECK_ARL_M64: #define __AES__ 1 +// CHECK_NVL_M64: #define __AVX10_1__ 1 +// CHECK_NVL_M64: #define __AVX10_2__ 1 // CHECK_ARL_M64: #define __AVX2__ 1 +// CHECK_NVL_M64: #define __AVX512BF16__ 1 +// CHECK_NVL_M64: #define __AVX512BITALG__ 1 +// CHECK_NVL_M64: #define __AVX512BW__ 1 +// CHECK_NVL_M64: #define __AVX512CD__ 1 +// CHECK_NVL_M64: #define __AVX512DQ__ 1 +// CHECK_NVL_M64: #define __AVX512FP16__ 1 +// CHECK_NVL_M64: #define __AVX512F__ 1 +// CHECK_NVL_M64: #define __AVX512IFMA__ 1 +// CHECK_NVL_M64: #define __AVX512VBMI2__ 1 +// CHECK_NVL_M64: #define __AVX512VBMI__ 1 +// CHECK_NVL_M64: #define __AVX512VL__ 1 +// CHECK_NVL_M64: #define __AVX512VNNI__ 1 +// CHECK_NVL_M64: 
#define __AVX512VPOPCNTDQ__ 1 // CHECK_ARL_M64-NOT: AVX512 // CHECK_ARL_M64: #define __AVXIFMA__ 1 // CHECK_ARL_M64: #define __AVXNECONVERT__ 1 @@ -2654,11 +2694,13 @@ // CHECK_ARL_M64: #define __AVX__ 1 // CHECK_ARL_M64: #define __BMI2__ 1 // CHECK_ARL_M64: #define __BMI__ 1 +// CHECK_NVL_M64: #define __CCMP__ 1 // CHECK_ARLS_M64-NOT: __CLDEMOTE__ // CHECK_SRF_M64: #define __CLDEMOTE__ 1 // CHECK_ARL_M64: #define __CLFLUSHOPT__ 1 // CHECK_ARL_M64: #define __CLWB__ 1 // CHECK_ARL_M64: #define __CMPCCXADD__ 1 +// CHECK_NVL_M64: #define __EGPR__ 1 // CHECK_ARL_M64: #define __ENQCMD__ 1 // CHECK_ARL_M64: #define __F16C__ 1 // CHECK_ARL_M64: #define __FMA__ 1 @@ -2674,15 +2716,20 @@ // CHECK_ARL_M64: #define __MOVBE__ 1 // CHECK_ARL_M64: #define __MOVDIR64B__ 1 // CHECK_ARL_M64: #define __MOVDIRI__ 1 +// CHECK_NVL_M64: #define __MOVRS__ 1 +// CHECK_NVL_M64: #define __NDD__ 1 +// CHECK_NVL_M64: #define __NF__ 1 // CHECK_ARL_M64: #define __PCLMUL__ 1 // CHECK_ARL_M64: #define __PCONFIG__ 1 // CHECK_ARL_M64: #define __PKU__ 1 // CHECK_ARL_M64: #define __POPCNT__ 1 +// CHECK_NVL_M64: #define __PPX__ 1 // CHECK_ARL_M64-NOT: #define __PREFETCHI__ 1 // CHECK_ARLS_M64-NOT: #define __PREFETCHI__ 1 -// CHECK_NVL_M64: #define __PREFETCHI__ 1 +// CHECK_CWF_M64: #define __PREFETCHI__ 1 // CHECK_ARL_M64: #define __PRFCHW__ 1 // CHECK_ARL_M64: #define __PTWRITE__ 1 +// CHECK_NVL_M64: #define __PUSH2POP2__ 1 // CHECK_ARL_M64-NOT: #define __RAOINT__ 1 // CHECK_ARL_M64: #define __RDPID__ 1 // CHECK_ARL_M64: #define __RDRND__ 1 @@ -2718,6 +2765,7 @@ // CHECK_ARL_M64: #define __XSAVEOPT__ 1 // CHECK_ARL_M64: #define __XSAVES__ 1 // CHECK_ARL_M64: #define __XSAVE__ 1 +// CHECK_NVL_M64: #define __ZU__ 1 // CHECK_ARL_M64: #define __amd64 1 // CHECK_ARL_M64: #define __amd64__ 1 // CHECK_ARL_M64: #define __corei7 1 diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9e291a6ae431f..27ec052cfda40 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1334,8 +1334,18 @@ def ProcessorFeatures { !listremove(ARLSFeatures, [FeatureWIDEKL]); // Novalake + list NVLAdditionalFeatures = [FeatureAVX10_2, + FeatureMOVRS, + FeatureEGPR, + FeaturePush2Pop2, + FeaturePPX, + FeatureNF, + FeatureNDD, + FeatureZU, + FeatureCCMP, + FeaturePREFETCHI]; list NVLFeatures = - !listconcat(PTLFeatures, [FeaturePREFETCHI]); + !listconcat(PTLFeatures, NVLAdditionalFeatures); // Clearwaterforest list CWFAdditionalFeatures = [FeaturePREFETCHI, diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 293cc42ab81c1..02c33b0af2e2f 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -176,7 +176,9 @@ constexpr FeatureBitset FeaturesArrowlakeS = constexpr FeatureBitset FeaturesPantherlake = (FeaturesArrowlakeS ^ FeatureWIDEKL); constexpr FeatureBitset FeaturesNovalake = - FeaturesPantherlake | FeaturePREFETCHI; + FeaturesPantherlake | FeaturePREFETCHI | FeatureAVX10_2 | FeatureMOVRS | + FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | FeaturePPX | + FeatureNDD | FeatureNF; constexpr FeatureBitset FeaturesClearwaterforest = (FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; From 6eab083e2dc21be8ef18fc2c58f762938f949799 Mon Sep 17 00:00:00 2001 From: Gil Rapaport Date: Mon, 17 Nov 2025 16:49:40 +0200 Subject: [PATCH 018/105] [mlir][emitc] Refactor brackets in expressions (#168267) This patch is a minor NFC-intended 
refactoring of the way the emission of redundant parentheses is prevented. The current implementation pushes and later pops a fake low precedence into the precedence stack when emitting function calls. The new implementation adds a boolean argument to `emitOperand()` that explicitly guarantees that the operand is being emitted between some kind of brackets, exempting the method from enforcing correct evaluation order w.r.t. precedence and associativity up the expression tree. --- mlir/lib/Target/Cpp/TranslateToCpp.cpp | 31 +++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 6bd76bb1ffc4b..56f81b0bea9e2 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -173,8 +173,11 @@ struct CppEmitter { /// Emits the operands of the operation. All operands are emitted in order. LogicalResult emitOperands(Operation &op); - /// Emits value as an operands of an operation - LogicalResult emitOperand(Value value); + /// Emits value as an operand of some operation. Unless \p isInBrackets is + /// true, operands emitted as sub-expressions will be parenthesized if needed + /// in order to enforce correct evaluation based on precedence and + /// associativity. + LogicalResult emitOperand(Value value, bool isInBrackets = false); /// Emit an expression as a C expression. LogicalResult emitExpression(ExpressionOp expressionOp); @@ -1578,7 +1581,7 @@ LogicalResult CppEmitter::emitExpression(ExpressionOp expressionOp) { return success(); } -LogicalResult CppEmitter::emitOperand(Value value) { +LogicalResult CppEmitter::emitOperand(Value value, bool isInBrackets) { if (isPartOfCurrentExpression(value)) { Operation *def = value.getDefiningOp(); assert(def && "Expected operand to be defined by an operation"); @@ -1586,10 +1589,12 @@ if (failed(precedence)) return failure(); - // Sub-expressions with equal or lower precedence need to be parenthesized, - // as they might be evaluated in the wrong order depending on the shape of - // the expression tree. - bool encloseInParenthesis = precedence.value() <= getExpressionPrecedence(); + // Unless already in brackets, sub-expressions with equal or lower + // precedence need to be parenthesized as they might be evaluated in the + // wrong order depending on the shape of the expression tree. + bool encloseInParenthesis = + !isInBrackets && precedence.value() <= getExpressionPrecedence(); + if (encloseInParenthesis) os << "("; pushExpressionPrecedence(precedence.value()); @@ -1628,15 +1633,9 @@ LogicalResult CppEmitter::emitOperand(Value value) { LogicalResult CppEmitter::emitOperands(Operation &op) { return interleaveCommaWithError(op.getOperands(), os, [&](Value operand) { - // If an expression is being emitted, push lowest precedence as these - // operands are either wrapped by parenthesis. - if (getEmittedExpression()) - pushExpressionPrecedence(lowestPrecedence()); - if (failed(emitOperand(operand))) - return failure(); - if (getEmittedExpression()) - popExpressionPrecedence(); - return success(); + // Emit operand under guarantee that if it's part of an expression then it + // is being emitted within brackets. 
+ return emitOperand(operand, /*isInBrackets=*/true); }); } From 9fe0a70579ee9e5477a77695822e026c67026ef3 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Mon, 17 Nov 2025 22:52:53 +0800 Subject: [PATCH 019/105] [llvm][RISCV] Support splat and vp_splat for zvfbfa codegen (#167920) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 18 +- .../RISCV/rvv/fixed-vectors-fp-splat-bf16.ll | 46 +++ .../RISCV/rvv/fixed-vectors-vp-splat.ll | 320 ++++++++++++++---- llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll | 25 +- llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll | 88 ++--- llvm/test/CodeGen/RISCV/rvv/vp-splat.ll | 80 +++++ llvm/test/CodeGen/RISCV/rvv/vsplats-bf16.ll | 23 ++ 7 files changed, 449 insertions(+), 151 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ff71be9a6bcb4..f313d3f1347d4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -88,9 +88,10 @@ static cl::opt cl::init(true)); // TODO: Support more ops -static const unsigned ZvfbfaVPOps[] = {ISD::VP_FNEG, ISD::VP_FABS, - ISD::VP_FCOPYSIGN}; -static const unsigned ZvfbfaOps[] = {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN}; +static const unsigned ZvfbfaVPOps[] = { + ISD::VP_FNEG, ISD::VP_FABS, ISD::VP_FCOPYSIGN, ISD::EXPERIMENTAL_VP_SPLAT}; +static const unsigned ZvfbfaOps[] = {ISD::FNEG, ISD::FABS, ISD::FCOPYSIGN, + ISD::SPLAT_VECTOR}; RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) @@ -1272,17 +1273,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLAT, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ZvfbfaVPOps, VT, Custom); - MVT EltVT = VT.getVectorElementType(); - if (isTypeLegal(EltVT)) - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, VT, - Custom); - else - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, - EltVT, Custom); setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER, ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, @@ -4870,7 +4866,7 @@ static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL, if (VT.isFloatingPoint()) { if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) || - EltVT == MVT::bf16) { + (EltVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) { if ((EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) || (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin())) Scalar = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Scalar); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll index c94cdadc8ca59..82e199b4969db 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFMIN ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZFBFMIN-ZVFBFMIN ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFMIN +; RUN: 
llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFA +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFA define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { ; ZFBFMIN-ZVFBFMIN-LABEL: splat_v8bf16: @@ -18,6 +20,12 @@ define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { ; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %a = insertelement <8 x bfloat> poison, bfloat %y, i32 0 %b = shufflevector <8 x bfloat> %a, <8 x bfloat> poison, <8 x i32> zeroinitializer ret <8 x bfloat> %b @@ -37,6 +45,12 @@ define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { ; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %a = insertelement <16 x bfloat> poison, bfloat %y, i32 0 %b = shufflevector <16 x bfloat> %a, <16 x bfloat> poison, <16 x i32> zeroinitializer ret <16 x bfloat> %b @@ -58,6 +72,12 @@ define <64 x bfloat> @splat_64bf16(ptr %x, bfloat %y) { ; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_64bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %a = insertelement <64 x bfloat> poison, bfloat %y, i32 0 %b = shufflevector <64 x bfloat> %a, <64 x bfloat> poison, <64 x i32> zeroinitializer ret <64 x bfloat> %b @@ -75,6 +95,12 @@ define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { ; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_zero_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.i v8, 0 +; ZVFBFA-NEXT: ret ret <8 x bfloat> splat (bfloat 0.0) } @@ -90,6 +116,12 @@ define <16 x bfloat> @splat_zero_16bf16(ptr %x) { ; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFBFMIN-NEXT: vmv.v.i v8, 0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_zero_16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.i v8, 0 +; ZVFBFA-NEXT: ret ret <16 x bfloat> splat (bfloat 0.0) } @@ -107,6 +139,13 @@ define <8 x bfloat> @splat_negzero_v8bf16(ptr %x) { ; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_negzero_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 1048568 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a0 +; ZVFBFA-NEXT: ret ret <8 x bfloat> splat (bfloat -0.0) } @@ -124,5 +163,12 @@ define <16 x bfloat> @splat_negzero_16bf16(ptr %x) { ; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFBFMIN-NEXT: vmv.v.x v8, a0 ; ZVFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: splat_negzero_16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 1048568 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a0 +; ZVFBFA-NEXT: ret ret <16 x bfloat> splat (bfloat -0.0) } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll index 40e337c811e8b..7901f8c290543 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH_RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH_RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA,ZVFBFA_RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA,ZVFBFA_RV64 define <1 x i8> @vp_splat_v1i8(i8 %val, <1 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_splat_v1i8: @@ -183,97 +185,275 @@ define <16 x i32> @vp_splat_v16i32(i32 %val, <16 x i1> %m, i32 zeroext %evl) { } define <1 x i64> @vp_splat_v1i64(i64 %val, <1 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vp_splat_v1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret +; ZVFH_RV32-LABEL: vp_splat_v1i64: +; ZVFH_RV32: # %bb.0: +; ZVFH_RV32-NEXT: addi sp, sp, -16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFH_RV32-NEXT: sw a0, 8(sp) +; ZVFH_RV32-NEXT: sw a1, 12(sp) +; ZVFH_RV32-NEXT: addi a0, sp, 8 +; ZVFH_RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFH_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFH_RV32-NEXT: addi sp, sp, 16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFH_RV32-NEXT: ret ; -; RV64-LABEL: vp_splat_v1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: ret +; ZVFH_RV64-LABEL: vp_splat_v1i64: +; ZVFH_RV64: # %bb.0: +; ZVFH_RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; ZVFH_RV64-NEXT: vmv.v.x v8, a0 +; ZVFH_RV64-NEXT: ret +; +; ZVFBFA_RV32-LABEL: vp_splat_v1i64: +; ZVFBFA_RV32: # %bb.0: +; ZVFBFA_RV32-NEXT: addi sp, sp, -16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA_RV32-NEXT: sw a0, 8(sp) +; ZVFBFA_RV32-NEXT: sw a1, 12(sp) +; ZVFBFA_RV32-NEXT: addi a0, sp, 8 +; ZVFBFA_RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVFBFA_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFBFA_RV32-NEXT: addi sp, sp, 16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA_RV32-NEXT: ret +; +; ZVFBFA_RV64-LABEL: vp_splat_v1i64: +; ZVFBFA_RV64: # %bb.0: +; ZVFBFA_RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; ZVFBFA_RV64-NEXT: vmv.v.x v8, a0 +; ZVFBFA_RV64-NEXT: ret %splat = call <1 x i64> @llvm.experimental.vp.splat.v1i64(i64 %val, <1 x i1> %m, i32 %evl) ret <1 x i64> %splat } define <2 x i64> @vp_splat_v2i64(i64 %val, <2 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vp_splat_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) 
-; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret +; ZVFH_RV32-LABEL: vp_splat_v2i64: +; ZVFH_RV32: # %bb.0: +; ZVFH_RV32-NEXT: addi sp, sp, -16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFH_RV32-NEXT: sw a0, 8(sp) +; ZVFH_RV32-NEXT: sw a1, 12(sp) +; ZVFH_RV32-NEXT: addi a0, sp, 8 +; ZVFH_RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVFH_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFH_RV32-NEXT: addi sp, sp, 16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFH_RV32-NEXT: ret +; +; ZVFH_RV64-LABEL: vp_splat_v2i64: +; ZVFH_RV64: # %bb.0: +; ZVFH_RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; ZVFH_RV64-NEXT: vmv.v.x v8, a0 +; ZVFH_RV64-NEXT: ret +; +; ZVFBFA_RV32-LABEL: vp_splat_v2i64: +; ZVFBFA_RV32: # %bb.0: +; ZVFBFA_RV32-NEXT: addi sp, sp, -16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA_RV32-NEXT: sw a0, 8(sp) +; ZVFBFA_RV32-NEXT: sw a1, 12(sp) +; ZVFBFA_RV32-NEXT: addi a0, sp, 8 +; ZVFBFA_RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVFBFA_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFBFA_RV32-NEXT: addi sp, sp, 16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA_RV32-NEXT: ret ; -; RV64-LABEL: vp_splat_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: ret +; ZVFBFA_RV64-LABEL: vp_splat_v2i64: +; ZVFBFA_RV64: # %bb.0: +; ZVFBFA_RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; ZVFBFA_RV64-NEXT: vmv.v.x v8, a0 +; ZVFBFA_RV64-NEXT: ret %splat = call <2 x i64> @llvm.experimental.vp.splat.v2i64(i64 %val, <2 x i1> %m, i32 %evl) ret <2 x i64> %splat } define <4 x i64> @vp_splat_v4i64(i64 %val, <4 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vp_splat_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret +; ZVFH_RV32-LABEL: vp_splat_v4i64: +; ZVFH_RV32: # %bb.0: +; ZVFH_RV32-NEXT: addi sp, sp, -16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFH_RV32-NEXT: sw a0, 8(sp) +; ZVFH_RV32-NEXT: sw a1, 12(sp) +; ZVFH_RV32-NEXT: addi a0, sp, 8 +; ZVFH_RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVFH_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFH_RV32-NEXT: addi sp, sp, 16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFH_RV32-NEXT: ret ; -; RV64-LABEL: vp_splat_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: ret +; ZVFH_RV64-LABEL: vp_splat_v4i64: +; ZVFH_RV64: # %bb.0: +; ZVFH_RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; ZVFH_RV64-NEXT: vmv.v.x v8, a0 +; ZVFH_RV64-NEXT: ret +; +; ZVFBFA_RV32-LABEL: vp_splat_v4i64: +; ZVFBFA_RV32: # %bb.0: +; ZVFBFA_RV32-NEXT: addi sp, sp, -16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA_RV32-NEXT: sw a0, 8(sp) +; ZVFBFA_RV32-NEXT: sw a1, 12(sp) +; ZVFBFA_RV32-NEXT: addi a0, sp, 8 +; ZVFBFA_RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVFBFA_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFBFA_RV32-NEXT: addi sp, sp, 16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA_RV32-NEXT: ret +; +; ZVFBFA_RV64-LABEL: vp_splat_v4i64: +; ZVFBFA_RV64: # %bb.0: +; ZVFBFA_RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; ZVFBFA_RV64-NEXT: vmv.v.x v8, a0 +; ZVFBFA_RV64-NEXT: ret %splat = call <4 x 
i64> @llvm.experimental.vp.splat.v4i64(i64 %val, <4 x i1> %m, i32 %evl) ret <4 x i64> %splat } define <8 x i64> @vp_splat_v8i64(i64 %val, <8 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: vp_splat_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret +; ZVFH_RV32-LABEL: vp_splat_v8i64: +; ZVFH_RV32: # %bb.0: +; ZVFH_RV32-NEXT: addi sp, sp, -16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFH_RV32-NEXT: sw a0, 8(sp) +; ZVFH_RV32-NEXT: sw a1, 12(sp) +; ZVFH_RV32-NEXT: addi a0, sp, 8 +; ZVFH_RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; ZVFH_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFH_RV32-NEXT: addi sp, sp, 16 +; ZVFH_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFH_RV32-NEXT: ret +; +; ZVFH_RV64-LABEL: vp_splat_v8i64: +; ZVFH_RV64: # %bb.0: +; ZVFH_RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; ZVFH_RV64-NEXT: vmv.v.x v8, a0 +; ZVFH_RV64-NEXT: ret +; +; ZVFBFA_RV32-LABEL: vp_splat_v8i64: +; ZVFBFA_RV32: # %bb.0: +; ZVFBFA_RV32-NEXT: addi sp, sp, -16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFA_RV32-NEXT: sw a0, 8(sp) +; ZVFBFA_RV32-NEXT: sw a1, 12(sp) +; ZVFBFA_RV32-NEXT: addi a0, sp, 8 +; ZVFBFA_RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; ZVFBFA_RV32-NEXT: vlse64.v v8, (a0), zero +; ZVFBFA_RV32-NEXT: addi sp, sp, 16 +; ZVFBFA_RV32-NEXT: .cfi_def_cfa_offset 0 +; ZVFBFA_RV32-NEXT: ret ; -; RV64-LABEL: vp_splat_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: ret +; ZVFBFA_RV64-LABEL: vp_splat_v8i64: +; ZVFBFA_RV64: # %bb.0: +; ZVFBFA_RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; ZVFBFA_RV64-NEXT: vmv.v.x v8, a0 +; ZVFBFA_RV64-NEXT: ret %splat = call <8 x i64> @llvm.experimental.vp.splat.v8i64(i64 %val, <8 x i1> %m, i32 %evl) ret <8 x i64> %splat } +define <1 x bfloat> @vp_splat_v1bf16(bfloat %val, <1 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v1bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret + %splat = call <1 x bfloat> @llvm.experimental.vp.splat.v1bf16(bfloat %val, <1 x i1> %m, i32 %evl) + ret <1 x bfloat> %splat +} + +define <2 x bfloat> @vp_splat_v2bf16(bfloat %val, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v2bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret + %splat = call <2 x bfloat> @llvm.experimental.vp.splat.v2bf16(bfloat %val, <2 x i1> %m, i32 %evl) + ret <2 x bfloat> %splat +} + +define <4 x bfloat> @vp_splat_v4bf16(bfloat %val, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v4bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, 
fa0 +; ZVFBFA-NEXT: ret + %splat = call <4 x bfloat> @llvm.experimental.vp.splat.v4bf16(bfloat %val, <4 x i1> %m, i32 %evl) + ret <4 x bfloat> %splat +} + +define <8 x bfloat> @vp_splat_v8bf16(bfloat %val, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v8bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret + %splat = call <8 x bfloat> @llvm.experimental.vp.splat.v8bf16(bfloat %val, <8 x i1> %m, i32 %evl) + ret <8 x bfloat> %splat +} + +define <16 x bfloat> @vp_splat_v16bf16(bfloat %val, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v16bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v16bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret + %splat = call <16 x bfloat> @llvm.experimental.vp.splat.v16bf16(bfloat %val, <16 x i1> %m, i32 %evl) + ret <16 x bfloat> %splat +} + +define <32 x bfloat> @vp_splat_v32bf16(bfloat %val, <32 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vp_splat_v32bf16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.w a1, fa0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vmv.v.x v8, a1 +; ZVFH-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_v32bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret + %splat = call <32 x bfloat> @llvm.experimental.vp.splat.v32bf16(bfloat %val, <32 x i1> %m, i32 %evl) + ret <32 x bfloat> %splat +} + define <1 x half> @vp_splat_v1f16(half %val, <1 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_splat_v1f16: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll index abd00b647e374..c4232915895cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll @@ -617,25 +617,22 @@ define @vfadd_vf_nxv32bf16( %va, bf ; ZVFBFA-NEXT: slli a0, a0, 3 ; ZVFBFA-NEXT: sub sp, sp, a0 ; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFBFA-NEXT: fmv.x.h a0, fa0 -; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m4, ta, ma -; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFBFA-NEXT: addi a1, sp, 16 -; ZVFBFA-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v12 -; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m8, ta, ma -; ZVFBFA-NEXT: vmv.v.x v8, a0 +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 ; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m4, ta, ma -; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFBFA-NEXT: addi a0, sp, 16 -; ZVFBFA-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFBFA-NEXT: vfwcvt.f.f.v v0, v12 +; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFBFA-NEXT: vfadd.vv v0, v8, v0 +; ZVFBFA-NEXT: vfadd.vv v16, v16, v8 ; ZVFBFA-NEXT: vsetvli zero, zero, 
e16alt, m4, ta, ma -; ZVFBFA-NEXT: vfncvt.f.f.w v8, v0 +; ZVFBFA-NEXT: vfncvt.f.f.w v8, v16 ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFBFA-NEXT: vfadd.vv v16, v24, v16 +; ZVFBFA-NEXT: vfadd.vv v16, v0, v24 ; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma ; ZVFBFA-NEXT: vfncvt.f.f.w v12, v16 ; ZVFBFA-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index 633a201c0131a..1ab2209647c80 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -120,9 +120,8 @@ define @vfadd_vf_nxv1bf16( %va, bfloa ; ; ZVFBFA-LABEL: vfadd_vf_nxv1bf16: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma -; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfmv.v.f v9, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -165,9 +164,8 @@ define @vfadd_vf_nxv1bf16_commute( %v ; ; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_commute: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma -; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfmv.v.f v9, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -210,9 +208,8 @@ define @vfadd_vf_nxv1bf16_unmasked( % ; ; ZVFBFA-LABEL: vfadd_vf_nxv1bf16_unmasked: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma -; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfmv.v.f v9, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFBFA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -255,9 +252,8 @@ define @vfadd_vf_nxv1bf16_unmasked_commute( @vfadd_vf_nxv2bf16( %va, bfloa ; ; ZVFBFA-LABEL: vfadd_vf_nxv2bf16: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma -; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfmv.v.f v9, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -421,9 +416,8 @@ define @vfadd_vf_nxv2bf16_unmasked( % ; ; ZVFBFA-LABEL: vfadd_vf_nxv2bf16_unmasked: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma -; ZVFBFA-NEXT: vmv.v.x v9, a1 +; ZVFBFA-NEXT: vfmv.v.f v9, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v9 ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -542,9 +536,8 @@ define @vfadd_vf_nxv4bf16( %va, bfloa ; ; ZVFBFA-LABEL: vfadd_vf_nxv4bf16: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma -; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfmv.v.f v12, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -587,9 +580,8 @@ define @vfadd_vf_nxv4bf16_unmasked( % ; ; ZVFBFA-LABEL: vfadd_vf_nxv4bf16_unmasked: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma -; ZVFBFA-NEXT: vmv.v.x v12, a1 +; ZVFBFA-NEXT: vfmv.v.f v12, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m2, ta, ma @@ -708,9 +700,8 @@ define @vfadd_vf_nxv8bf16( %va, bfloa ; ; ZVFBFA-LABEL: vfadd_vf_nxv8bf16: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: 
vsetvli zero, a0, e16alt, m2, ta, ma -; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -753,9 +744,8 @@ define @vfadd_vf_nxv8bf16_unmasked( % ; ; ZVFBFA-LABEL: vfadd_vf_nxv8bf16_unmasked: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma -; ZVFBFA-NEXT: vmv.v.x v16, a1 +; ZVFBFA-NEXT: vfmv.v.f v16, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v16 ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m4, ta, ma @@ -874,9 +864,8 @@ define @vfadd_vf_nxv16bf16( %va, bf ; ; ZVFBFA-LABEL: vfadd_vf_nxv16bf16: ; ZVFBFA: # %bb.0: -; ZVFBFA-NEXT: fmv.x.h a1, fa0 ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma -; ZVFBFA-NEXT: vmv.v.x v24, a1 +; ZVFBFA-NEXT: vfmv.v.f v24, fa0 ; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFBFA-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -919,9 +908,8 @@ define @vfadd_vf_nxv16bf16_unmasked( @vfadd_vf_nxv32bf16( %va, bf ; ZVFBFA-NEXT: addi sp, sp, -16 ; ZVFBFA-NEXT: .cfi_def_cfa_offset 16 ; ZVFBFA-NEXT: csrr a1, vlenb -; ZVFBFA-NEXT: slli a1, a1, 4 +; ZVFBFA-NEXT: slli a1, a1, 3 ; ZVFBFA-NEXT: sub sp, sp, a1 -; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; ZVFBFA-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFBFA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m8, ta, ma ; ZVFBFA-NEXT: vmv1r.v v7, v0 -; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vfmv.v.f v24, fa0 ; ZVFBFA-NEXT: csrr a2, vlenb -; ZVFBFA-NEXT: vmv.v.x v24, a1 ; ZVFBFA-NEXT: slli a1, a2, 1 ; ZVFBFA-NEXT: srli a2, a2, 2 ; ZVFBFA-NEXT: sub a3, a0, a1 -; ZVFBFA-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFBFA-NEXT: vsetvli a4, zero, e8alt, mf2, ta, ma ; ZVFBFA-NEXT: vslidedown.vx v0, v0, a2 ; ZVFBFA-NEXT: sltu a2, a0, a3 ; ZVFBFA-NEXT: addi a2, a2, -1 ; ZVFBFA-NEXT: and a2, a2, a3 -; ZVFBFA-NEXT: csrr a3, vlenb -; ZVFBFA-NEXT: slli a3, a3, 3 -; ZVFBFA-NEXT: add a3, sp, a3 -; ZVFBFA-NEXT: addi a3, a3, 16 +; ZVFBFA-NEXT: addi a3, sp, 16 ; ZVFBFA-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; ZVFBFA-NEXT: vsetvli zero, a2, e16alt, m4, ta, ma ; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v28, v0.t @@ -1402,24 +1386,17 @@ define @vfadd_vf_nxv32bf16( %va, bf ; ZVFBFA-NEXT: mv a0, a1 ; ZVFBFA-NEXT: .LBB24_2: ; ZVFBFA-NEXT: vmv1r.v v0, v7 +; ZVFBFA-NEXT: addi a1, sp, 16 +; ZVFBFA-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m4, ta, ma -; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v8, v0.t -; ZVFBFA-NEXT: addi a0, sp, 16 -; ZVFBFA-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFBFA-NEXT: csrr a0, vlenb -; ZVFBFA-NEXT: slli a0, a0, 3 -; ZVFBFA-NEXT: add a0, sp, a0 -; ZVFBFA-NEXT: addi a0, a0, 16 -; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v16, v0.t -; ZVFBFA-NEXT: addi a0, sp, 16 -; ZVFBFA-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFBFA-NEXT: vfwcvt.f.f.v v16, v24, v0.t +; ZVFBFA-NEXT: vfwcvt.f.f.v v24, v8, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFBFA-NEXT: vfadd.vv v16, v16, v24, v0.t +; ZVFBFA-NEXT: vfadd.vv v16, v24, v16, v0.t ; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma ; ZVFBFA-NEXT: 
vfncvt.f.f.w v8, v16, v0.t ; ZVFBFA-NEXT: csrr a0, vlenb -; ZVFBFA-NEXT: slli a0, a0, 4 +; ZVFBFA-NEXT: slli a0, a0, 3 ; ZVFBFA-NEXT: add sp, sp, a0 ; ZVFBFA-NEXT: .cfi_def_cfa sp, 16 ; ZVFBFA-NEXT: addi sp, sp, 16 @@ -1542,15 +1519,14 @@ define @vfadd_vf_nxv32bf16_unmasked( @vfadd_vf_nxv32bf16_unmasked( @vp_splat_nxv1i8(i8 %val, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_splat_nxv1i8: @@ -292,6 +294,12 @@ define @vp_splat_nxv1bf16(bfloat %val, % ; ZFMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv1bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv1bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -310,6 +318,12 @@ define @vp_splat_nxv2bf16(bfloat %val, % ; ZFMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv2bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv2bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -328,6 +342,12 @@ define @vp_splat_nxv4bf16(bfloat %val, % ; ZFMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv4bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv4bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -346,6 +366,12 @@ define @vp_splat_nxv8bf16(bfloat %val, % ; ZFMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli zero, a0, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv8bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -364,6 +390,12 @@ define @vp_splat_nxv16bf16(bfloat %val, @llvm.experimental.vp.splat.nxv16bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -382,6 +414,12 @@ define @vp_splat_nxv32bf16(bfloat %val, @llvm.experimental.vp.splat.nxv32bf16(bfloat %val, %m, i32 %evl) ret %splat } @@ -406,6 +444,13 @@ define @vp_splat_nxv1f16(half %val, %m, i3 ; ZFMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv1f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv1f16(half %val, %m, i32 %evl) ret %splat } @@ -430,6 +475,13 @@ define @vp_splat_nxv2f16(half %val, %m, i3 ; ZFMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv2f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv2f16(half %val, %m, i32 %evl) ret %splat } @@ -454,6 +506,13 @@ define @vp_splat_nxv4f16(half %val, %m, i3 ; ZFMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv4f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret 
%splat = call @llvm.experimental.vp.splat.nxv4f16(half %val, %m, i32 %evl) ret %splat } @@ -478,6 +537,13 @@ define @vp_splat_nxv8f16(half %val, %m, i3 ; ZFMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv8f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv8f16(half %val, %m, i32 %evl) ret %splat } @@ -502,6 +568,13 @@ define @vp_splat_nxv16f16(half %val, %m, ; ZFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv16f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv16f16(half %val, %m, i32 %evl) ret %splat } @@ -526,6 +599,13 @@ define @vp_splat_nxv32f16(half %val, %m, ; ZFMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZFMIN-NEXT: vmv.v.x v8, a1 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vp_splat_nxv32f16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a1 +; ZVFBFA-NEXT: ret %splat = call @llvm.experimental.vp.splat.nxv32f16(half %val, %m, i32 %evl) ret %splat } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-bf16.ll index af9881aca03bc..24ed31cc55225 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsplats-bf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-bf16.ll @@ -7,6 +7,10 @@ ; RUN: | FileCheck %s --check-prefixes=NOZFBFMIN ; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+zvfbfmin,+v -target-abi lp64d -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=NOZFBFMIN +; RUN: llc -mtriple=riscv32 -mattr=+f,+d,+experimental-zvfbfa,+v -target-abi ilp32d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+experimental-zvfbfa,+v -target-abi lp64d -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=ZVFBFA define @vsplat_nxv8bf16(bfloat %f) { ; CHECK-LABEL: vsplat_nxv8bf16: @@ -22,6 +26,12 @@ define @vsplat_nxv8bf16(bfloat %f) { ; NOZFBFMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; NOZFBFMIN-NEXT: vmv.v.x v8, a0 ; NOZFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vsplat_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfmv.v.f v8, fa0 +; ZVFBFA-NEXT: ret %head = insertelement poison, bfloat %f, i32 0 %splat = shufflevector %head, poison, zeroinitializer ret %splat @@ -39,6 +49,12 @@ define @vsplat_zero_nxv8bf16() { ; NOZFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; NOZFBFMIN-NEXT: vmv.v.i v8, 0 ; NOZFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vsplat_zero_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.i v8, 0 +; ZVFBFA-NEXT: ret ret splat (bfloat zeroinitializer) } @@ -56,5 +72,12 @@ define @vsplat_negzero_nxv8bf16() { ; NOZFBFMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; NOZFBFMIN-NEXT: vmv.v.x v8, a0 ; NOZFBFMIN-NEXT: ret +; +; ZVFBFA-LABEL: vsplat_negzero_nxv8bf16: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: lui a0, 1048568 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFBFA-NEXT: vmv.v.x v8, a0 +; ZVFBFA-NEXT: ret ret splat (bfloat -0.0) } From 53e3f8e3c50ee28e33181514933c5632a001033b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 
14:50:16 +0000 Subject: [PATCH 020/105] [XRay] Prefix setting XRAY_OPTIONS with env This makes setting the environment variable work with the new internal shell. It does not fix all of the XRay tests, because some of them use subshells and need to be rewritten to avoid them. --- compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp | 6 +++--- compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp | 2 +- compiler-rt/test/xray/TestCases/Posix/c-test.cpp | 2 +- .../xray/TestCases/Posix/common-trampoline-alignment.cpp | 2 +- .../TestCases/Posix/custom-event-handler-alignment.cpp | 2 +- .../test/xray/TestCases/Posix/custom-event-logging.cpp | 4 ++-- .../test/xray/TestCases/Posix/fdr-mode-inmemory.cpp | 4 ++-- .../test/xray/TestCases/Posix/fdr-mode-multiple.cpp | 4 ++-- compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp | 8 ++++---- .../test/xray/TestCases/Posix/fdr-thread-order.cpp | 2 +- .../test/xray/TestCases/Posix/fork_basic_logging.cpp | 2 +- .../xray/TestCases/Posix/profiling-multi-threaded.cpp | 4 ++-- .../xray/TestCases/Posix/profiling-single-threaded.cpp | 4 ++-- compiler-rt/test/xray/TestCases/Posix/quiet-start.cpp | 6 +++--- 14 files changed, 26 insertions(+), 26 deletions(-) diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp index 1440460c9de27..47e7a0710131d 100644 --- a/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp @@ -3,7 +3,7 @@ // RUN: %clangxx_xray -std=c++11 %s -o %t -g // RUN: rm -f basic-filtering-* -// RUN: XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1 \ +// RUN: env XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1 \ // RUN: xray_logfile_base=basic-filtering- \ // RUN: xray_naive_log_func_duration_threshold_us=1000 \ // RUN: xray_naive_log_max_stack_depth=2" %run %t 2>&1 | \ @@ -14,9 +14,9 @@ // RUN: rm -f basic-filtering-* // // Now check support for the XRAY_BASIC_OPTIONS environment variable. 
-// RUN: XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1 \ +// RUN: env XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1 \ // RUN: xray_logfile_base=basic-filtering-" \ -// RUN: XRAY_BASIC_OPTIONS="func_duration_threshold_us=1000 max_stack_depth=2" \ +// RUN: env XRAY_BASIC_OPTIONS="func_duration_threshold_us=1000 max_stack_depth=2" \ // RUN: %run %t 2>&1 | FileCheck %s // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ // RUN: "`ls basic-filtering-* | head -1`" | \ diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp index d40dcd808bcba..eb76d5f95b6cd 100644 --- a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp @@ -4,7 +4,7 @@ // RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so // RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o -// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s +// RUN: env XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s // RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s // RUN: rm basic-mode-dso-* diff --git a/compiler-rt/test/xray/TestCases/Posix/c-test.cpp b/compiler-rt/test/xray/TestCases/Posix/c-test.cpp index 6427566186514..d7c766682f38f 100644 --- a/compiler-rt/test/xray/TestCases/Posix/c-test.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/c-test.cpp @@ -1,6 +1,6 @@ // RUN: %clang_xray -g -fxray-modes=xray-basic,xray-fdr,xray-profiling -o %t %s // RUN: rm -f xray-log.c-test.* -// RUN: XRAY_OPTIONS=patch_premain=true:verbosity=1:xray_mode=xray-basic %t \ +// RUN: env XRAY_OPTIONS=patch_premain=true:verbosity=1:xray_mode=xray-basic %t \ // RUN: 2>&1 | FileCheck %s // RUN: rm -f xray-log.c-test.* // diff --git a/compiler-rt/test/xray/TestCases/Posix/common-trampoline-alignment.cpp b/compiler-rt/test/xray/TestCases/Posix/common-trampoline-alignment.cpp index f9189644b40b8..d072e57f5cdbc 100644 --- a/compiler-rt/test/xray/TestCases/Posix/common-trampoline-alignment.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/common-trampoline-alignment.cpp @@ -2,7 +2,7 @@ // expect 16-byte alignment of the stack. // // RUN: %clangxx_xray -std=c++11 %s -o %t -// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1" \ +// RUN: env XRAY_OPTIONS="patch_premain=false verbosity=1" \ // RUN: %run %t 2>&1 // REQUIRES: x86_64-target-arch // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/custom-event-handler-alignment.cpp b/compiler-rt/test/xray/TestCases/Posix/custom-event-handler-alignment.cpp index 9c61cba83b0da..19154820d5089 100644 --- a/compiler-rt/test/xray/TestCases/Posix/custom-event-handler-alignment.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/custom-event-handler-alignment.cpp @@ -2,7 +2,7 @@ // calls. 
// // RUN: %clangxx_xray -std=c++11 %s -o %t -// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1" \ +// RUN: env XRAY_OPTIONS="patch_premain=false verbosity=1" \ // RUN: %run %t 2>&1 // REQUIRES: x86_64-target-arch // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/custom-event-logging.cpp b/compiler-rt/test/xray/TestCases/Posix/custom-event-logging.cpp index 30f4fffa429ee..cbdfe6c57b38e 100644 --- a/compiler-rt/test/xray/TestCases/Posix/custom-event-logging.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/custom-event-logging.cpp @@ -1,9 +1,9 @@ // Use the clang feature for custom xray event logging. // // RUN: %clangxx_xray -std=c++11 %s -o %t -// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1 xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s +// RUN: env XRAY_OPTIONS="patch_premain=false verbosity=1 xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s // RUN: %clangxx_xray -std=c++11 -fpic -fpie %s -o %t -// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1 xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s +// RUN: env XRAY_OPTIONS="patch_premain=false verbosity=1 xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s // FIXME: Support this in non-x86_64 as well // REQUIRES: target={{(aarch64|x86_64)-.*linux.*}} // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp index 6c94dbd8b21d2..0517c9a9b47e5 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp @@ -2,9 +2,9 @@ // RUN: rm -rf %t.dir // RUN: mkdir -p %t.dir // RUN: cd %t.dir -// RUN: XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-inmemory-test- \ +// RUN: env XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-inmemory-test- \ // RUN: verbosity=1" \ -// RUN: XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ +// RUN: env XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s // RUN: FILES=`find %t.dir -name 'fdr-inmemory-test-*' | wc -l` // RUN: [ $FILES -eq 0 ] diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp index f9288d9002de0..a545f540bc6e7 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp @@ -2,9 +2,9 @@ // RUN: rm -rf %t.dir // RUN: mkdir -p %t.dir // RUN: cd %t.dir -// RUN: XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-inmemory-test- \ +// RUN: env XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-inmemory-test- \ // RUN: verbosity=1" \ -// RUN: XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ +// RUN: env XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s // RUN: FILES=`find %t.dir -name 'fdr-inmemory-test-*' | wc -l` // RUN: [ $FILES -eq 0 ] diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp index 0ee8aaa755d5a..e74cacd1fe9ff 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp @@ -1,14 +1,14 @@ // RUN: %clangxx_xray -g -std=c++11 %s -o %t // RUN: rm -f fdr-logging-test-* // RUN: rm -f fdr-unwrite-test-* -// RUN: 
XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-logging-test- \ +// RUN: env XRAY_OPTIONS="patch_premain=false xray_logfile_base=fdr-logging-test- \ // RUN: xray_mode=xray-fdr verbosity=1" \ -// RUN: XRAY_FDR_OPTIONS="func_duration_threshold_us=0" \ +// RUN: env XRAY_FDR_OPTIONS="func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s -// RUN: XRAY_OPTIONS="patch_premain=false \ +// RUN: env XRAY_OPTIONS="patch_premain=false \ // RUN: xray_logfile_base=fdr-unwrite-test- xray_mode=xray-fdr \ // RUN: verbosity=1" \ -// RUN: XRAY_FDR_OPTIONS="func_duration_threshold_us=5000" \ +// RUN: env XRAY_FDR_OPTIONS="func_duration_threshold_us=5000" \ // RUN: %run %t 2>&1 | FileCheck %s // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ // RUN: "`ls fdr-logging-test-* | head -1`" \ diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-thread-order.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-thread-order.cpp index 85284fc27ab38..d3fd2536ead00 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-thread-order.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-thread-order.cpp @@ -1,6 +1,6 @@ // RUN: rm -rf %t && mkdir %t // RUN: %clangxx_xray -g -std=c++11 %s -o %t.exe -// RUN: XRAY_OPTIONS="patch_premain=false \ +// RUN: env XRAY_OPTIONS="patch_premain=false \ // RUN: xray_logfile_base=%t/ xray_mode=xray-fdr verbosity=1" \ // RUN: XRAY_FDR_OPTIONS=func_duration_threshold_us=0 %run %t.exe 2>&1 | \ // RUN: FileCheck %s diff --git a/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp b/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp index 58f310e3a1083..6f201a329db24 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp @@ -1,7 +1,7 @@ // Check that when forking in basic logging mode, we get the different tids for child and parent // RUN: %clangxx_xray -g -std=c++11 %s -o %t // RUN: rm -f fork-basic-logging-test-* -// RUN: XRAY_OPTIONS="patch_premain=true xray_logfile_base=fork-basic-logging-test- \ +// RUN: env XRAY_OPTIONS="patch_premain=true xray_logfile_base=fork-basic-logging-test- \ // RUN: xray_mode=xray-basic verbosity=1 xray_naive_log_func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ diff --git a/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp b/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp index b850c053681a1..5d00d0f02b3e1 100644 --- a/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp @@ -4,9 +4,9 @@ // FIXME: Make -fxray-modes=xray-profiling part of the default? 
// RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiling // RUN: rm -f xray-log.profiling-multi-* -// RUN: XRAY_OPTIONS=verbosity=1 \ +// RUN: env XRAY_OPTIONS=verbosity=1 \ // RUN: XRAY_PROFILING_OPTIONS=no_flush=1 %run %t -// RUN: XRAY_OPTIONS=verbosity=1 %run %t +// RUN: env XRAY_OPTIONS=verbosity=1 %run %t // RUN: PROFILES=`ls xray-log.profiling-multi-* | wc -l` // RUN: [ $PROFILES -eq 1 ] // RUN: rm -f xray-log.profiling-multi-* diff --git a/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp b/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp index b2359607379d6..d0226613db7c1 100644 --- a/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp @@ -4,9 +4,9 @@ // FIXME: Make -fxray-modes=xray-profiling part of the default? // RUN: %clangxx_xray -std=c++11 %s -o %t -fxray-modes=xray-profiling // RUN: rm -f xray-log.profiling-single-* -// RUN: XRAY_OPTIONS=verbosity=1 \ +// RUN: env XRAY_OPTIONS=verbosity=1 \ // RUN: XRAY_PROFILING_OPTIONS=no_flush=true %run %t -// RUN: XRAY_OPTIONS=verbosity=1 %run %t +// RUN: env XRAY_OPTIONS=verbosity=1 %run %t // RUN: PROFILES=`ls xray-log.profiling-single-* | wc -l` // RUN: [ $PROFILES -eq 2 ] // RUN: rm -f xray-log.profiling-single-* diff --git a/compiler-rt/test/xray/TestCases/Posix/quiet-start.cpp b/compiler-rt/test/xray/TestCases/Posix/quiet-start.cpp index 48830017047c0..d8e96e0297bd1 100644 --- a/compiler-rt/test/xray/TestCases/Posix/quiet-start.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/quiet-start.cpp @@ -2,11 +2,11 @@ // instrumentation sleds. // // RUN: %clangxx -std=c++11 %s -o %t %xraylib -// RUN: XRAY_OPTIONS="patch_premain=true verbosity=1" %run %t 2>&1 | \ +// RUN: env XRAY_OPTIONS="patch_premain=true verbosity=1" %run %t 2>&1 | \ // RUN: FileCheck %s --check-prefix NOISY -// RUN: XRAY_OPTIONS="patch_premain=true verbosity=0" %run %t 2>&1 | \ +// RUN: env XRAY_OPTIONS="patch_premain=true verbosity=0" %run %t 2>&1 | \ // RUN: FileCheck %s --check-prefix QUIET -// RUN: XRAY_OPTIONS="" %run %t 2>&1 | FileCheck %s --check-prefix DEFAULT +// RUN: env XRAY_OPTIONS="" %run %t 2>&1 | FileCheck %s --check-prefix DEFAULT // REQUIRES: built-in-llvm-tree From e95c5c85113066fbf14307e31a533fdb1a7387ef Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 17 Nov 2025 16:01:25 +0100 Subject: [PATCH 021/105] [libc++] Refactor basic_string::__recommend (#162631) This does a couple of things: - code that is only useful for `shrink_to_fit` is moved into that function - `shrink_to_fit` is simplified a bit - `__recommend` is renamed to better reflect what the function actually does - `__allocate_long_buffer` asserts that the passed capacity doesn't fit into the SSO --- libcxx/include/string | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 09fc6228c4fdb..c4806069d0b44 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2273,7 +2273,9 @@ private: // Allocate a buffer of __capacity size with __alloc and return it _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 __long __allocate_long_buffer(_Allocator& __alloc, size_type __capacity) { - auto __buffer = std::__allocate_at_least(__alloc, __recommend(__capacity) + 1); + _LIBCPP_ASSERT_INTERNAL(!__fits_in_sso(__capacity), + "Trying to allocate long buffer for a capacity that would fit into the small buffer"); + auto __buffer =
std::__allocate_at_least(__alloc, __align_allocation_size(__capacity)); if (__libcpp_is_constant_evaluated()) { for (size_type __i = 0; __i != __buffer.count; ++__i) @@ -2365,16 +2367,20 @@ private: return (__s + (__a - 1)) & ~(__a - 1); } enum { __alignment = 8 }; - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { - if (__s < __min_cap) { - return static_cast(__min_cap) - 1; - } + + // This makes sure that we're using a capacity with some extra alignment, since allocators almost always over-align + // the allocations anyways, improving memory usage. More importantly, this ensures that the lowest bit is never set + // if __endian_factor == 2, allowing us to store whether we're in the long string inside the lowest bit. + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + __align_allocation_size(size_type __size) _NOEXCEPT { + _LIBCPP_ASSERT_INTERNAL( + !__fits_in_sso(__size), "Trying to align allocation of a size which would fit into the SSO"); const size_type __boundary = sizeof(value_type) < __alignment ? __alignment / sizeof(value_type) : __endian_factor; - size_type __guess = __align_it<__boundary>(__s + 1) - 1; - if (__guess == __min_cap) + size_type __guess = __align_it<__boundary>(__size + 1); + if (__guess == __min_cap + 1) __guess += __endian_factor; - _LIBCPP_ASSERT_INTERNAL(__guess >= __s, "recommendation is below the requested size"); + _LIBCPP_ASSERT_INTERNAL(__guess >= __size, "aligned allocation size is below the requested size"); return __guess; } @@ -2712,8 +2718,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ if (__delta_cap > __ms - __old_cap) __throw_length_error(); pointer __old_p = __get_pointer(); - size_type __cap = - __old_cap < __ms / 2 - __alignment ? __recommend(std::max(__old_cap + __delta_cap, 2 * __old_cap)) : __ms; + size_type __cap = __old_cap < __ms / 2 - __alignment ? std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms; __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); __long __buffer = __allocate_long_buffer(__alloc_, __cap); @@ -2750,8 +2755,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait if (__delta_cap > __ms - __old_cap) this->__throw_length_error(); pointer __old_p = __get_pointer(); - size_type __cap = - __old_cap < __ms / 2 - __alignment ? __recommend(std::max(__old_cap + __delta_cap, 2 * __old_cap)) : __ms; + size_type __cap = __old_cap < __ms / 2 - __alignment ? std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms; __long __buffer = __allocate_long_buffer(__alloc_, __cap); if (__n_copy != 0) traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__old_p), __n_copy); @@ -3417,18 +3421,15 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re template inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::shrink_to_fit() _NOEXCEPT { - size_type __target_capacity = __recommend(size()); - if (__target_capacity == capacity()) + if (!__is_long()) return; - _LIBCPP_ASSERT_INTERNAL(__is_long(), "Trying to shrink small string"); - - // We're a long string and we're shrinking into the small buffer. const auto __ptr = __get_long_pointer(); const auto __size = __get_long_size(); const auto __cap = __get_long_cap(); - if (__fits_in_sso(__target_capacity)) { + // We're a long string and we're shrinking into the small buffer. 
+ if (__fits_in_sso(__size)) { __annotation_guard __g(*this); __set_short_size(__size); traits_type::copy(std::__to_address(__get_short_pointer()), std::__to_address(__ptr), __size + 1); @@ -3436,6 +3437,9 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat return; } + if (__align_allocation_size(__size) == __cap) + return; + # if _LIBCPP_HAS_EXCEPTIONS try { # endif // _LIBCPP_HAS_EXCEPTIONS From c7a9be81be9a08a4be208a89fa514ff0fe4de915 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 15:07:36 +0000 Subject: [PATCH 022/105] [XRay] Rewrite tests to not use subshells So that they will actually function with the internal shell. --- compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp | 6 ++++-- compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp | 3 ++- compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp | 4 ++-- compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp | 4 ++-- compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp | 6 ++++-- .../test/xray/TestCases/Posix/fork_basic_logging.cpp | 3 ++- .../test/xray/TestCases/Posix/profiling-multi-threaded.cpp | 4 ++-- .../test/xray/TestCases/Posix/profiling-single-threaded.cpp | 4 ++-- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp index 47e7a0710131d..7b46fe29a00e3 100644 --- a/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/basic-filtering.cpp @@ -8,8 +8,9 @@ // RUN: xray_naive_log_func_duration_threshold_us=1000 \ // RUN: xray_naive_log_max_stack_depth=2" %run %t 2>&1 | \ // RUN: FileCheck %s +// RUN: ls basic-filtering-* | head -1 | tr -d '\n' > %t.log // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ -// RUN: "`ls basic-filtering-* | head -1`" | \ +// RUN: "%{readfile:%t.log}" | \ // RUN: FileCheck %s --check-prefix TRACE // RUN: rm -f basic-filtering-* // @@ -18,8 +19,9 @@ // RUN: xray_logfile_base=basic-filtering-" \ // RUN: env XRAY_BASIC_OPTIONS="func_duration_threshold_us=1000 max_stack_depth=2" \ // RUN: %run %t 2>&1 | FileCheck %s +// RUN: ls basic-filtering-* | head -1 | tr -d '\n' > %t.log // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ -// RUN: "`ls basic-filtering-* | head -1`" | \ +// RUN: "%{readfile:%t.log}" | \ // RUN: FileCheck %s --check-prefix TRACE // RUN: rm -f basic-filtering-* diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp index eb76d5f95b6cd..954c7ef9626d8 100644 --- a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp @@ -5,7 +5,8 @@ // RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o // RUN: env XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s -// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s +// RUN: ls basic-mode-dso-* | head -1 | tr -d '\n' > %t.log +// RUN: %llvm_xray account --format=csv --sort=funcid "%{readfile:%t.log}" | FileCheck --check-prefix=ACCOUNT %s // RUN: rm basic-mode-dso-* // REQUIRES: target={{(aarch64|x86_64)-.*}} diff --git 
a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp index 0517c9a9b47e5..58a800a7c382a 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-inmemory.cpp @@ -6,8 +6,8 @@ // RUN: verbosity=1" \ // RUN: env XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s -// RUN: FILES=`find %t.dir -name 'fdr-inmemory-test-*' | wc -l` -// RUN: [ $FILES -eq 0 ] +// RUN: find %t.dir -name 'fdr-inmemory-test-*' | wc -l | tr -d '\n' > %t.file_count +// RUN: %python -c "import sys; sys.exit(int(sys.argv[1]))" %{readfile:%t.file_count} // RUN: rm -rf %t.dir // // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp index a545f540bc6e7..ffe43c9b516d7 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode-multiple.cpp @@ -6,8 +6,8 @@ // RUN: verbosity=1" \ // RUN: env XRAY_FDR_OPTIONS="no_file_flush=true func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s -// RUN: FILES=`find %t.dir -name 'fdr-inmemory-test-*' | wc -l` -// RUN: [ $FILES -eq 0 ] +// RUN: find %t.dir -name 'fdr-inmemory-test-*' | wc -l | tr -d '\n' > %t.file_count +// RUN: %python -c "import sys; sys.exit(int(sys.argv[1]))" %{readfile:%t.file_count} // RUN: rm -rf %t.dir // // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp b/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp index e74cacd1fe9ff..fa83b5767e015 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fdr-mode.cpp @@ -10,11 +10,13 @@ // RUN: verbosity=1" \ // RUN: env XRAY_FDR_OPTIONS="func_duration_threshold_us=5000" \ // RUN: %run %t 2>&1 | FileCheck %s +// RUN: ls fdr-logging-test-* | head -1 | tr -d '\n' > %t.log // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ -// RUN: "`ls fdr-logging-test-* | head -1`" \ +// RUN: "%{readfile:%t.log}" \ // RUN: | FileCheck %s --check-prefix=TRACE +// RUN: ls fdr-unwrite-test-* | head -1 | tr -d '\n' > %t.log // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ -// RUN: "`ls fdr-unwrite-test-* | head -1`" \ +// RUN: "%{readfile:%t.log}" \ // RUN: | FileCheck %s --check-prefix=UNWRITE // RUN: rm fdr-logging-test-* // RUN: rm fdr-unwrite-test-* diff --git a/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp b/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp index 6f201a329db24..d0905c18cdc14 100644 --- a/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/fork_basic_logging.cpp @@ -4,8 +4,9 @@ // RUN: env XRAY_OPTIONS="patch_premain=true xray_logfile_base=fork-basic-logging-test- \ // RUN: xray_mode=xray-basic verbosity=1 xray_naive_log_func_duration_threshold_us=0" \ // RUN: %run %t 2>&1 | FileCheck %s +// RUN: ls -S fork-basic-logging-test-* | head -1 | tr -d '\n' > %t.log // RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \ -// RUN: "`ls -S fork-basic-logging-test-* | head -1`" \ +// RUN: "%{readfile:%t.log}" \ // RUN: | FileCheck %s --check-prefix=TRACE // REQUIRES: x86_64-target-arch diff --git a/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp 
b/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp index 5d00d0f02b3e1..b1b8edc659ea6 100644 --- a/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/profiling-multi-threaded.cpp @@ -7,8 +7,8 @@ // RUN: env XRAY_OPTIONS=verbosity=1 \ // RUN: XRAY_PROFILING_OPTIONS=no_flush=1 %run %t // RUN: env XRAY_OPTIONS=verbosity=1 %run %t -// RUN: PROFILES=`ls xray-log.profiling-multi-* | wc -l` -// RUN: [ $PROFILES -eq 1 ] +// RUN: ls xray-log.profiling-multi-* | wc -l | tr -d '\n' > %t.profiles +// RUN: %python -c "import sys; sys.exit(int(sys.argv[1]) - 1)" %{readfile:%t.profiles} // RUN: rm -f xray-log.profiling-multi-* // // REQUIRES: built-in-llvm-tree diff --git a/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp b/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp index d0226613db7c1..d84f75bcac0da 100644 --- a/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp +++ b/compiler-rt/test/xray/TestCases/Posix/profiling-single-threaded.cpp @@ -7,8 +7,8 @@ // RUN: env XRAY_OPTIONS=verbosity=1 \ // RUN: XRAY_PROFILING_OPTIONS=no_flush=true %run %t // RUN: env XRAY_OPTIONS=verbosity=1 %run %t -// RUN: PROFILES=`ls xray-log.profiling-single-* | wc -l` -// RUN: [ $PROFILES -eq 2 ] +// RUN: ls xray-log.profiling-single-* | wc -l | tr -d '\n' > %t.profiles +// RUN: %python -c "import sys; sys.exit(int(sys.argv[1]) - 2)" %{readfile:%t.profiles} // RUN: rm -f xray-log.profiling-single-* // // REQUIRES: built-in-llvm-tree From e9743e24189d02b4ba71095c8581f2fb6412c140 Mon Sep 17 00:00:00 2001 From: Jakub Ficek Date: Mon, 17 Nov 2025 16:18:22 +0100 Subject: [PATCH 023/105] [clang] Support constrained fp elementwise builtins (#166905) Currently only __builtin_elementwise_sqrt emits a constrained fp intrinsic and propagates fp options. This commit adds this support for the rest of the elementwise builtins.
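For reference, a minimal sketch of the effect under strict floating-point semantics; the float4 typedef and the -ffp-exception-behavior=strict compile mode below are assumptions borrowed from the updated strictfp-elementwise-builtins.cpp test, not new API:

typedef float float4 __attribute__((ext_vector_type(4)));

// Compiled with, e.g., clang -ffp-exception-behavior=strict.
float4 elementwise_cos(float4 a) {
  // Previously this lowered to the plain intrinsic:
  //   call <4 x float> @llvm.cos.v4f32(<4 x float> %a)
  // With this change it lowers to the constrained form, carrying the
  // rounding mode and exception behavior as metadata operands:
  //   call <4 x float> @llvm.experimental.constrained.cos.v4f32(
  //       <4 x float> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
  return __builtin_elementwise_cos(a);
}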
--- clang/lib/CodeGen/CGBuiltin.cpp | 121 ++++-------- .../CodeGen/strictfp-elementwise-builtins.cpp | 174 ++++++++---------- .../CodeGenHLSL/builtins/exp-overloads.hlsl | 80 ++++---- clang/test/CodeGenHLSL/builtins/exp.hlsl | 48 ++--- .../CodeGenHLSL/builtins/exp2-overloads.hlsl | 80 ++++---- clang/test/CodeGenHLSL/builtins/exp2.hlsl | 48 ++--- clang/test/CodeGenHLSL/builtins/ldexp.hlsl | 32 ++-- clang/test/CodeGenHLSL/builtins/lit.hlsl | 12 +- .../CodeGenHLSL/builtins/round-overloads.hlsl | 80 ++++---- clang/test/CodeGenHLSL/builtins/round.hlsl | 48 ++--- 10 files changed, 321 insertions(+), 402 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7e9273b20ad5b..93f691e4c2267 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2727,6 +2727,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_acosf16: case Builtin::BI__builtin_acosl: case Builtin::BI__builtin_acosf128: + case Builtin::BI__builtin_elementwise_acos: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::acos, Intrinsic::experimental_constrained_acos)); @@ -2738,6 +2739,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_asinf16: case Builtin::BI__builtin_asinl: case Builtin::BI__builtin_asinf128: + case Builtin::BI__builtin_elementwise_asin: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::asin, Intrinsic::experimental_constrained_asin)); @@ -2749,6 +2751,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_atanf16: case Builtin::BI__builtin_atanl: case Builtin::BI__builtin_atanf128: + case Builtin::BI__builtin_elementwise_atan: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::atan, Intrinsic::experimental_constrained_atan)); @@ -2760,6 +2763,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_atan2f16: case Builtin::BI__builtin_atan2l: case Builtin::BI__builtin_atan2f128: + case Builtin::BI__builtin_elementwise_atan2: return RValue::get(emitBinaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::atan2, Intrinsic::experimental_constrained_atan2)); @@ -2772,6 +2776,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_ceilf16: case Builtin::BI__builtin_ceill: case Builtin::BI__builtin_ceilf128: + case Builtin::BI__builtin_elementwise_ceil: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::ceil, Intrinsic::experimental_constrained_ceil)); @@ -2795,6 +2800,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_cosf16: case Builtin::BI__builtin_cosl: case Builtin::BI__builtin_cosf128: + case Builtin::BI__builtin_elementwise_cos: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::cos, Intrinsic::experimental_constrained_cos)); @@ -2807,6 +2813,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_coshf16: case Builtin::BI__builtin_coshl: case Builtin::BI__builtin_coshf128: + case Builtin::BI__builtin_elementwise_cosh: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::cosh, Intrinsic::experimental_constrained_cosh)); @@ -2818,6 +2825,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, 
unsigned BuiltinID, case Builtin::BI__builtin_expf16: case Builtin::BI__builtin_expl: case Builtin::BI__builtin_expf128: + case Builtin::BI__builtin_elementwise_exp: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::exp, Intrinsic::experimental_constrained_exp)); @@ -2830,6 +2838,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_exp2f16: case Builtin::BI__builtin_exp2l: case Builtin::BI__builtin_exp2f128: + case Builtin::BI__builtin_elementwise_exp2: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::exp2, Intrinsic::experimental_constrained_exp2)); @@ -2837,7 +2846,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_exp10f: case Builtin::BI__builtin_exp10f16: case Builtin::BI__builtin_exp10l: - case Builtin::BI__builtin_exp10f128: { + case Builtin::BI__builtin_exp10f128: + case Builtin::BI__builtin_elementwise_exp10: { // TODO: strictfp support if (Builder.getIsFPConstrained()) break; @@ -2863,6 +2873,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_floorf16: case Builtin::BI__builtin_floorl: case Builtin::BI__builtin_floorf128: + case Builtin::BI__builtin_elementwise_floor: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::floor, Intrinsic::experimental_constrained_floor)); @@ -2875,6 +2886,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_fmaf16: case Builtin::BI__builtin_fmal: case Builtin::BI__builtin_fmaf128: + case Builtin::BI__builtin_elementwise_fma: return RValue::get(emitTernaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::fma, Intrinsic::experimental_constrained_fma)); @@ -2939,7 +2951,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E); Value *Arg1 = EmitScalarExpr(E->getArg(0)); Value *Arg2 = EmitScalarExpr(E->getArg(1)); - return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod")); + if (Builder.getIsFPConstrained()) { + Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_frem, + Arg1->getType()); + return RValue::get(Builder.CreateConstrainedFPCall(F, {Arg1, Arg2})); + } else { + return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod")); + } } case Builtin::BIlog: @@ -2950,6 +2968,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_logf16: case Builtin::BI__builtin_logl: case Builtin::BI__builtin_logf128: + case Builtin::BI__builtin_elementwise_log: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::log, Intrinsic::experimental_constrained_log)); @@ -2962,6 +2981,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_log10f16: case Builtin::BI__builtin_log10l: case Builtin::BI__builtin_log10f128: + case Builtin::BI__builtin_elementwise_log10: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::log10, Intrinsic::experimental_constrained_log10)); @@ -2974,6 +2994,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_log2f16: case Builtin::BI__builtin_log2l: case Builtin::BI__builtin_log2f128: + case Builtin::BI__builtin_elementwise_log2: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::log2, 
Intrinsic::experimental_constrained_log2)); @@ -2985,6 +3006,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_nearbyintf: case Builtin::BI__builtin_nearbyintl: case Builtin::BI__builtin_nearbyintf128: + case Builtin::BI__builtin_elementwise_nearbyint: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::nearbyint, Intrinsic::experimental_constrained_nearbyint)); @@ -2997,6 +3019,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_powf16: case Builtin::BI__builtin_powl: case Builtin::BI__builtin_powf128: + case Builtin::BI__builtin_elementwise_pow: return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::pow, Intrinsic::experimental_constrained_pow)); @@ -3009,6 +3032,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_rintf16: case Builtin::BI__builtin_rintl: case Builtin::BI__builtin_rintf128: + case Builtin::BI__builtin_elementwise_rint: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::rint, Intrinsic::experimental_constrained_rint)); @@ -3021,6 +3045,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_roundf16: case Builtin::BI__builtin_roundl: case Builtin::BI__builtin_roundf128: + case Builtin::BI__builtin_elementwise_round: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::round, Intrinsic::experimental_constrained_round)); @@ -3033,6 +3058,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_roundevenf16: case Builtin::BI__builtin_roundevenl: case Builtin::BI__builtin_roundevenf128: + case Builtin::BI__builtin_elementwise_roundeven: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::roundeven, Intrinsic::experimental_constrained_roundeven)); @@ -3045,6 +3071,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_sinf16: case Builtin::BI__builtin_sinl: case Builtin::BI__builtin_sinf128: + case Builtin::BI__builtin_elementwise_sin: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::sin, Intrinsic::experimental_constrained_sin)); @@ -3057,6 +3084,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_sinhf16: case Builtin::BI__builtin_sinhl: case Builtin::BI__builtin_sinhf128: + case Builtin::BI__builtin_elementwise_sinh: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh)); @@ -3104,6 +3132,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_tanf16: case Builtin::BI__builtin_tanl: case Builtin::BI__builtin_tanf128: + case Builtin::BI__builtin_elementwise_tan: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan)); @@ -3115,6 +3144,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_tanhf16: case Builtin::BI__builtin_tanhl: case Builtin::BI__builtin_tanhf128: + case Builtin::BI__builtin_elementwise_tanh: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::tanh, Intrinsic::experimental_constrained_tanh)); @@ -3126,6 +3156,7 @@ RValue 
CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_truncf16: case Builtin::BI__builtin_truncl: case Builtin::BI__builtin_truncf128: + case Builtin::BI__builtin_elementwise_trunc: return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, Intrinsic::trunc, Intrinsic::experimental_constrained_trunc)); @@ -3177,11 +3208,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_ldexpf: case Builtin::BI__builtin_ldexpl: case Builtin::BI__builtin_ldexpf16: - case Builtin::BI__builtin_ldexpf128: { + case Builtin::BI__builtin_ldexpf128: + case Builtin::BI__builtin_elementwise_ldexp: return RValue::get(emitBinaryExpMaybeConstrainedFPBuiltin( *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp)); - } default: break; } @@ -3977,100 +4008,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Result); } - case Builtin::BI__builtin_elementwise_acos: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::acos, "elt.acos")); - case Builtin::BI__builtin_elementwise_asin: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::asin, "elt.asin")); - case Builtin::BI__builtin_elementwise_atan: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::atan, "elt.atan")); - case Builtin::BI__builtin_elementwise_atan2: - return RValue::get(emitBuiltinWithOneOverloadedType<2>( - *this, E, Intrinsic::atan2, "elt.atan2")); - case Builtin::BI__builtin_elementwise_ceil: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::ceil, "elt.ceil")); - case Builtin::BI__builtin_elementwise_exp: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::exp, "elt.exp")); - case Builtin::BI__builtin_elementwise_exp2: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::exp2, "elt.exp2")); - case Builtin::BI__builtin_elementwise_exp10: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::exp10, "elt.exp10")); - case Builtin::BI__builtin_elementwise_ldexp: { - Value *Src = EmitScalarExpr(E->getArg(0)); - Value *Exp = EmitScalarExpr(E->getArg(1)); - Value *Result = Builder.CreateLdexp(Src, Exp, {}, "elt.ldexp"); - return RValue::get(Result); - } - case Builtin::BI__builtin_elementwise_log: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::log, "elt.log")); - case Builtin::BI__builtin_elementwise_log2: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::log2, "elt.log2")); - case Builtin::BI__builtin_elementwise_log10: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::log10, "elt.log10")); - case Builtin::BI__builtin_elementwise_pow: { - return RValue::get( - emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::pow)); - } case Builtin::BI__builtin_elementwise_bitreverse: return RValue::get(emitBuiltinWithOneOverloadedType<1>( *this, E, Intrinsic::bitreverse, "elt.bitreverse")); - case Builtin::BI__builtin_elementwise_cos: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::cos, "elt.cos")); - case Builtin::BI__builtin_elementwise_cosh: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::cosh, "elt.cosh")); - case Builtin::BI__builtin_elementwise_floor: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, 
Intrinsic::floor, "elt.floor")); case Builtin::BI__builtin_elementwise_popcount: return RValue::get(emitBuiltinWithOneOverloadedType<1>( *this, E, Intrinsic::ctpop, "elt.ctpop")); - case Builtin::BI__builtin_elementwise_roundeven: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::roundeven, "elt.roundeven")); - case Builtin::BI__builtin_elementwise_round: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::round, "elt.round")); - case Builtin::BI__builtin_elementwise_rint: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::rint, "elt.rint")); - case Builtin::BI__builtin_elementwise_nearbyint: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::nearbyint, "elt.nearbyint")); - case Builtin::BI__builtin_elementwise_sin: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::sin, "elt.sin")); - case Builtin::BI__builtin_elementwise_sinh: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::sinh, "elt.sinh")); - case Builtin::BI__builtin_elementwise_tan: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::tan, "elt.tan")); - case Builtin::BI__builtin_elementwise_tanh: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::tanh, "elt.tanh")); - case Builtin::BI__builtin_elementwise_trunc: - return RValue::get(emitBuiltinWithOneOverloadedType<1>( - *this, E, Intrinsic::trunc, "elt.trunc")); case Builtin::BI__builtin_elementwise_canonicalize: return RValue::get(emitBuiltinWithOneOverloadedType<1>( *this, E, Intrinsic::canonicalize, "elt.canonicalize")); case Builtin::BI__builtin_elementwise_copysign: return RValue::get( emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::copysign)); - case Builtin::BI__builtin_elementwise_fma: - return RValue::get( - emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fma)); case Builtin::BI__builtin_elementwise_fshl: return RValue::get( emitBuiltinWithOneOverloadedType<3>(*this, E, Intrinsic::fshl)); diff --git a/clang/test/CodeGen/strictfp-elementwise-builtins.cpp b/clang/test/CodeGen/strictfp-elementwise-builtins.cpp index b250512efc5c7..6453d50f044aa 100644 --- a/clang/test/CodeGen/strictfp-elementwise-builtins.cpp +++ b/clang/test/CodeGen/strictfp-elementwise-builtins.cpp @@ -68,180 +68,170 @@ float4 strict_elementwise_minimum(float4 a, float4 b) { } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_ceilDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_CEIL:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_CEIL]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float> [[A]], metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_ceil(float4 a) { return __builtin_elementwise_ceil(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_acosDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_ACOS:%.*]] = tail call <4 x float> @llvm.acos.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x 
float> [[ELT_ACOS]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.acos.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_acos(float4 a) { return __builtin_elementwise_acos(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_cosDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_COS:%.*]] = tail call <4 x float> @llvm.cos.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_COS]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.cos.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_cos(float4 a) { return __builtin_elementwise_cos(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_coshDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_COSH:%.*]] = tail call <4 x float> @llvm.cosh.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_COSH]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.cosh.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_cosh(float4 a) { return __builtin_elementwise_cosh(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_expDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_EXP:%.*]] = tail call <4 x float> @llvm.exp.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_EXP]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.exp.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_exp(float4 a) { return __builtin_elementwise_exp(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_exp2Dv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_EXP2:%.*]] = tail call <4 x float> @llvm.exp2.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_EXP2]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.exp2.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_exp2(float4 a) { return __builtin_elementwise_exp2(a); } -// CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_exp10Dv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_EXP10:%.*]] = tail call <4 x float> @llvm.exp10.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> 
[[ELT_EXP10]] -// -float4 strict_elementwise_exp10(float4 a) { - return __builtin_elementwise_exp10(a); -} - // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_floorDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_FLOOR:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_FLOOR]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> [[A]], metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_floor(float4 a) { return __builtin_elementwise_floor(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_logDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_LOG:%.*]] = tail call <4 x float> @llvm.log.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_LOG]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.log.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_log(float4 a) { return __builtin_elementwise_log(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_log2Dv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_LOG2:%.*]] = tail call <4 x float> @llvm.log2.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_LOG2]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.log2.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_log2(float4 a) { return __builtin_elementwise_log2(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_log10Dv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_LOG2:%.*]] = tail call <4 x float> @llvm.log2.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_LOG2]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.log2.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_log10(float4 a) { return __builtin_elementwise_log2(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z28strict_elementwise_roundevenDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_ROUNDEVEN:%.*]] = tail call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_ROUNDEVEN]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> [[A]], metadata 
!"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_roundeven(float4 a) { return __builtin_elementwise_roundeven(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_roundDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_ROUND:%.*]] = tail call <4 x float> @llvm.round.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_ROUND]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float> [[A]], metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_round(float4 a) { return __builtin_elementwise_round(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_rintDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_RINT:%.*]] = tail call <4 x float> @llvm.rint.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_RINT]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.rint.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_rint(float4 a) { return __builtin_elementwise_rint(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z28strict_elementwise_nearbyintDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_NEARBYINT:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_NEARBYINT]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_nearbyint(float4 a) { return __builtin_elementwise_nearbyint(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_asinDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_ASIN:%.*]] = tail call <4 x float> @llvm.asin.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_ASIN]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.asin.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_asin(float4 a) { return __builtin_elementwise_asin(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_sinDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_SIN:%.*]] = tail call <4 x float> @llvm.sin.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_SIN]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> 
@llvm.experimental.constrained.sin.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_sin(float4 a) { return __builtin_elementwise_sin(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_sinhDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_SINH:%.*]] = tail call <4 x float> @llvm.sinh.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_SINH]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.sinh.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_sinh(float4 a) { return __builtin_elementwise_sinh(a); @@ -258,79 +248,59 @@ float4 strict_elementwise_sqrt(float4 a) { } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_atanDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_ATAN:%.*]] = tail call <4 x float> @llvm.atan.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_ATAN]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.atan.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_atan(float4 a) { return __builtin_elementwise_atan(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_tanDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_TAN:%.*]] = tail call <4 x float> @llvm.tan.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_TAN]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.tan.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_tan(float4 a) { return __builtin_elementwise_tan(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_tanhDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_TANH:%.*]] = tail call <4 x float> @llvm.tanh.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_TANH]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.tanh.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_tanh(float4 a) { return __builtin_elementwise_tanh(a); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_atan2Dv4_fS_ -// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[ELT_ATAN2:%.*]] = tail call <4 x float> @llvm.atan2.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_ATAN2]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.atan2.v4f32(<4 x float> [[A]], <4 x float> [[B]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_atan2(float4 a, float4 b) { return __builtin_elementwise_atan2(a, b); } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_truncDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_TRUNC:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_TRUNC]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float> [[A]], metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_trunc(float4 a) { return __builtin_elementwise_trunc(a); } -// CHECK-LABEL: define dso_local noundef <4 x float> @_Z31strict_elementwise_canonicalizeDv4_f -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[ELT_CANONICALIZE:%.*]] = tail call <4 x float> @llvm.canonicalize.v4f32(<4 x float> [[A]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[ELT_CANONICALIZE]] -// -float4 strict_elementwise_canonicalize(float4 a) { - return __builtin_elementwise_canonicalize(a); -} - -// CHECK-LABEL: define dso_local noundef <4 x float> @_Z27strict_elementwise_copysignDv4_fS_ -// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[TMP0]] -// -float4 strict_elementwise_copysign(float4 a, float4 b) { - return __builtin_elementwise_copysign(a, b); -} - // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_fmaDv4_fS_S_ -// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] // CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_fma(float4 a, float4 b, float4 c) { @@ -338,9 +308,9 @@ float4 strict_elementwise_fma(float4 a, float4 b, float4 c) { } // CHECK-LABEL: define dso_local noundef <4 x float> @_Z22strict_elementwise_powDv4_fS_ -// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.pow.v4f32(<4 x 
float> [[A]], <4 x float> [[B]]) #[[ATTR4]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.pow.v4f32(<4 x float> [[A]], <4 x float> [[B]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] // CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_pow(float4 a, float4 b) { @@ -350,8 +320,8 @@ float4 strict_elementwise_pow(float4 a, float4 b) { // CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_fmodDv4_fS_ // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[FMOD:%.*]] = tail call <4 x float> @llvm.experimental.constrained.frem.v4f32(<4 x float> [[A]], <4 x float> [[B]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] -// CHECK-NEXT: ret <4 x float> [[FMOD]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.frem.v4f32(<4 x float> [[A]], <4 x float> [[B]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float4 strict_elementwise_fmod(float4 a, float4 b) { return __builtin_elementwise_fmod(a, b); diff --git a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl index df34beeba7a8c..c22f012421e3a 100644 --- a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl @@ -3,86 +3,86 @@ // RUN: FileCheck %s --check-prefixes=CHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_double -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// CHECK: ret float %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// CHECK: ret float [[EXP]] float test_exp_double(double p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 -// CHECK: ret <2 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 +// CHECK: ret <2 x float> [[EXP]] float2 test_exp_double2(double2 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 -// CHECK: ret <3 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 +// CHECK: ret <3 x float> [[EXP]] float3 test_exp_double3(double3 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 -// CHECK: ret <4 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 +// CHECK: ret <4 x float> [[EXP]] float4 test_exp_double4(double4 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// CHECK: ret float %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// CHECK: ret float [[EXP]] float test_exp_int(int p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2 -// CHECK: %elt.exp = call 
reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 -// CHECK: ret <2 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 +// CHECK: ret <2 x float> [[EXP]] float2 test_exp_int2(int2 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 -// CHECK: ret <3 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 +// CHECK: ret <3 x float> [[EXP]] float3 test_exp_int3(int3 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 -// CHECK: ret <4 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 +// CHECK: ret <4 x float> [[EXP]] float4 test_exp_int4(int4 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// CHECK: ret float %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// CHECK: ret float [[EXP]] float test_exp_uint(uint p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 -// CHECK: ret <2 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 +// CHECK: ret <2 x float> [[EXP]] float2 test_exp_uint2(uint2 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 -// CHECK: ret <3 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 +// CHECK: ret <3 x float> [[EXP]] float3 test_exp_uint3(uint3 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 -// CHECK: ret <4 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 +// CHECK: ret <4 x float> [[EXP]] float4 test_exp_uint4(uint4 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// CHECK: ret float %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// CHECK: ret float [[EXP]] float test_exp_int64_t(int64_t p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 -// CHECK: ret <2 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 +// CHECK: ret <2 x float> [[EXP]] float2 test_exp_int64_t2(int64_t2 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 -// CHECK: ret <3 x float> %elt.exp +// CHECK: [[EXP:%.*]] = 
call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 +// CHECK: ret <3 x float> [[EXP]] float3 test_exp_int64_t3(int64_t3 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 -// CHECK: ret <4 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 +// CHECK: ret <4 x float> [[EXP]] float4 test_exp_int64_t4(int64_t4 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// CHECK: ret float %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// CHECK: ret float [[EXP]] float test_exp_uint64_t(uint64_t p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 -// CHECK: ret <2 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 +// CHECK: ret <2 x float> [[EXP]] float2 test_exp_uint64_t2(uint64_t2 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 -// CHECK: ret <3 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 +// CHECK: ret <3 x float> [[EXP]] float3 test_exp_uint64_t3(uint64_t3 p0) { return exp(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4 -// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 -// CHECK: ret <4 x float> %elt.exp +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 +// CHECK: ret <4 x float> [[EXP]] float4 test_exp_uint64_t4(uint64_t4 p0) { return exp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl index d50ef021eecb8..56efb03d1f98b 100644 --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl @@ -6,47 +6,47 @@ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_exp_half -// NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16( -// NATIVE_HALF: ret half %elt.exp +// NATIVE_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16( +// NATIVE_HALF: ret half [[EXP]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_exp_half -// NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( -// NO_HALF: ret float %elt.exp +// NO_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( +// NO_HALF: ret float [[EXP]] half test_exp_half(half p0) { return exp(p0); } // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2 -// NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp.v2f16 -// NATIVE_HALF: ret <2 x half> %elt.exp +// NATIVE_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp.v2f16 +// NATIVE_HALF: ret <2 x half> [[EXP]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> 
@_Z14test_exp_half2
-// NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32(
-// NO_HALF: ret <2 x float> %elt.exp
+// NO_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32(
+// NO_HALF: ret <2 x float> [[EXP]]
 half2 test_exp_half2(half2 p0) { return exp(p0); }
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3
-// NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp.v3f16
-// NATIVE_HALF: ret <3 x half> %elt.exp
+// NATIVE_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp.v3f16
+// NATIVE_HALF: ret <3 x half> [[EXP]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3
-// NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32(
-// NO_HALF: ret <3 x float> %elt.exp
+// NO_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32(
+// NO_HALF: ret <3 x float> [[EXP]]
 half3 test_exp_half3(half3 p0) { return exp(p0); }
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4
-// NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp.v4f16
-// NATIVE_HALF: ret <4 x half> %elt.exp
+// NATIVE_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp.v4f16
+// NATIVE_HALF: ret <4 x half> [[EXP]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4
-// NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32(
-// NO_HALF: ret <4 x float> %elt.exp
+// NO_HALF: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32(
+// NO_HALF: ret <4 x float> [[EXP]]
 half4 test_exp_half4(half4 p0) { return exp(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp_float
-// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
-// CHECK: ret float %elt.exp
+// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
+// CHECK: ret float [[EXP]]
 float test_exp_float(float p0) { return exp(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2
-// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
-// CHECK: ret <2 x float> %elt.exp
+// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
+// CHECK: ret <2 x float> [[EXP]]
 float2 test_exp_float2(float2 p0) { return exp(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3
-// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
-// CHECK: ret <3 x float> %elt.exp
+// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
+// CHECK: ret <3 x float> [[EXP]]
 float3 test_exp_float3(float3 p0) { return exp(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4
-// CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
-// CHECK: ret <4 x float> %elt.exp
+// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
+// CHECK: ret <4 x float> [[EXP]]
 float4 test_exp_float4(float4 p0) { return exp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
index 20482777a18de..a8a6f3ba76b4f 100644
--- 
a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl @@ -3,86 +3,86 @@ // RUN: FileCheck %s --check-prefixes=CHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_double -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_double(double p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_double2(double2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_double3(double3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_double4(double4 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_int(int p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_int2(int2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_int3(int3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_int4(int4 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float 
@llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_uint(uint p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_uint2(uint2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_uint3(uint3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_uint4(uint4 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_int64_t(int64_t p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_int64_t2(int64_t2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_int64_t3(int64_t3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_int64_t4(int64_t4 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_uint64_t(uint64_t p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> 
@llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_uint64_t2(uint64_t2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_uint64_t3(uint64_t3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_uint64_t4(uint64_t4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl index ed8cfcf47b04b..b4d9c411681d1 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl @@ -6,47 +6,47 @@ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_exp2_half -// NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16( -// NATIVE_HALF: ret half %elt.exp2 +// NATIVE_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16( +// NATIVE_HALF: ret half [[EXP2]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp2_half -// NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// NO_HALF: ret float %elt.exp2 +// NO_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// NO_HALF: ret float [[EXP2]] half test_exp2_half(half p0) { return exp2(p0); } // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2 -// NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16 -// NATIVE_HALF: ret <2 x half> %elt.exp2 +// NATIVE_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16 +// NATIVE_HALF: ret <2 x half> [[EXP2]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2 -// NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32( -// NO_HALF: ret <2 x float> %elt.exp2 +// NO_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32( +// NO_HALF: ret <2 x float> [[EXP2]] half2 test_exp2_half2(half2 p0) { return exp2(p0); } // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3 -// NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16 -// NATIVE_HALF: ret <3 x half> %elt.exp2 +// NATIVE_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16 +// NATIVE_HALF: ret <3 x half> [[EXP2]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3 -// NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32( -// NO_HALF: ret <3 x float> %elt.exp2 +// NO_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32( +// NO_HALF: ret <3 x float> [[EXP2]] half3 test_exp2_half3(half3 p0) { return exp2(p0); } // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) 
<4 x half> @_Z15test_exp2_half4 -// NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16 -// NATIVE_HALF: ret <4 x half> %elt.exp2 +// NATIVE_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16 +// NATIVE_HALF: ret <4 x half> [[EXP2]] // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4 -// NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32( -// NO_HALF: ret <4 x float> %elt.exp2 +// NO_HALF: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32( +// NO_HALF: ret <4 x float> [[EXP2]] half4 test_exp2_half4(half4 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_exp2_float -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( -// CHECK: ret float %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( +// CHECK: ret float [[EXP2]] float test_exp2_float(float p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 -// CHECK: ret <2 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 +// CHECK: ret <2 x float> [[EXP2]] float2 test_exp2_float2(float2 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 -// CHECK: ret <3 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 +// CHECK: ret <3 x float> [[EXP2]] float3 test_exp2_float3(float3 p0) { return exp2(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4 -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 -// CHECK: ret <4 x float> %elt.exp2 +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 +// CHECK: ret <4 x float> [[EXP2]] float4 test_exp2_float4(float4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl index 012adc588ddfa..2dec126788956 100644 --- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl @@ -1,49 +1,49 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -fnative-int16-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn half %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn half [[EXP2]], %{{.*}} // CHECK: ret half %mul half test_ldexp_half(half X, half Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16(<2 x half> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp 
afn <2 x half> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16(<2 x half> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[EXP2]], %{{.*}} // CHECK: ret <2 x half> %mul half2 test_ldexp_half2(half2 X, half2 Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16(<3 x half> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x half> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16(<3 x half> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[EXP2]], %{{.*}} // CHECK: ret <3 x half> %mul half3 test_ldexp_half3(half3 X, half3 Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16(<4 x half> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x half> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16(<4 x half> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[EXP2]], %{{.*}} // CHECK: ret <4 x half> %mul half4 test_ldexp_half4(half4 X, half4 Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(float %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn float %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(float %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn float [[EXP2]], %{{.*}} // CHECK: ret float %mul float test_ldexp_float(float X, float Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(<2 x float> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x float> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(<2 x float> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[EXP2]], %{{.*}} // CHECK: ret <2 x float> %mul float2 test_ldexp_float2(float2 X, float2 Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(<3 x float> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x float> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(<3 x float> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[EXP2]], %{{.*}} // CHECK: ret <3 x float> %mul float3 test_ldexp_float3(float3 X, float3 Exp) { return ldexp(X, Exp); } // CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_ -// CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x 
float> @llvm.exp2.v4f32(<4 x float> %{{.*}}) -// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x float> %elt.exp2, %{{.*}} +// CHECK: [[EXP2:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(<4 x float> %{{.*}}) +// CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[EXP2]], %{{.*}} // CHECK: ret <4 x float> %mul float4 test_ldexp_float4(float4 X, float4 Exp) { return ldexp(X, Exp); } diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl b/clang/test/CodeGenHLSL/builtins/lit.hlsl index c0b109a75906b..b7979960de9f6 100644 --- a/clang/test/CodeGenHLSL/builtins/lit.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl @@ -7,9 +7,9 @@ // CHECK: %vecinit2.i = insertelement <4 x half> %{{.*}}, half 0xH3C00, i32 3 // CHECK: %cmp4.i = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000 // CHECK: %hlsl.or.i = or i1 %{{.*}}, %cmp4.i -// CHECK: %elt.log.i = call reassoc nnan ninf nsz arcp afn half @llvm.log.f16(half %{{.*}}) -// CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn half %elt.log.i, %{{.*}} -// CHECK: %elt.exp.i = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16(half %mul.i) +// CHECK: [[LOG:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.log.f16(half %{{.*}}) +// CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn half [[LOG]], %{{.*}} +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16(half %mul.i) // CHECK: %hlsl.select7.i = select reassoc nnan ninf nsz arcp afn i1 %{{.*}}, half 0xH0000, half %{{.*}} // CHECK: %vecins.i = insertelement <4 x half> %{{.*}}, half %hlsl.select7.i, i32 2 // CHECK: ret <4 x half> %{{.*}} @@ -22,9 +22,9 @@ half4 test_lit_half(half NDotL, half NDotH, half M) { return lit(NDotL, NDotH, M // CHECK: %vecinit2.i = insertelement <4 x float> %{{.*}}, float 1.000000e+00, i32 3 // CHECK: %cmp4.i = fcmp reassoc nnan ninf nsz arcp afn olt float %{{.*}}, 0.000000e+00 // CHECK: %hlsl.or.i = or i1 %{{.*}}, %cmp4.i -// CHECK: %elt.log.i = call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(float %{{.*}}) -// CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn float %elt.log.i, %{{.*}} -// CHECK: %elt.exp.i = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(float %mul.i) +// CHECK: [[LOG:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(float %{{.*}}) +// CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn float [[LOG]], %{{.*}} +// CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(float %mul.i) // CHECK: %hlsl.select7.i = select reassoc nnan ninf nsz arcp afn i1 %{{.*}}, float 0.000000e+00, float %{{.*}} // CHECK: %vecins.i = insertelement <4 x float> %{{.*}}, float %hlsl.select7.i, i32 2 // CHECK: ret <4 x float> %{{.*}} diff --git a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl index 3b07fcec064d8..5719d9d92991e 100644 --- a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl @@ -3,86 +3,86 @@ // RUN: FileCheck %s --check-prefixes=CHECK // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_double -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( -// CHECK: ret float %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( +// CHECK: ret float [[ROUNDEVEN]] float test_round_double(double p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> 
{{.*}}test_round_double2 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 -// CHECK: ret <2 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> [[ROUNDEVEN]] float2 test_round_double2(double2 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 -// CHECK: ret <3 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 +// CHECK: ret <3 x float> [[ROUNDEVEN]] float3 test_round_double3(double3 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 -// CHECK: ret <4 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 +// CHECK: ret <4 x float> [[ROUNDEVEN]] float4 test_round_double4(double4 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( -// CHECK: ret float %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( +// CHECK: ret float [[ROUNDEVEN]] float test_round_int(int p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 -// CHECK: ret <2 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> [[ROUNDEVEN]] float2 test_round_int2(int2 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 -// CHECK: ret <3 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 +// CHECK: ret <3 x float> [[ROUNDEVEN]] float3 test_round_int3(int3 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 -// CHECK: ret <4 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 +// CHECK: ret <4 x float> [[ROUNDEVEN]] float4 test_round_int4(int4 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( -// CHECK: ret float %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( +// CHECK: ret float [[ROUNDEVEN]] float test_round_uint(uint p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 -// CHECK: ret <2 x float> %elt.roundeven +// CHECK: 
[[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> [[ROUNDEVEN]] float2 test_round_uint2(uint2 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 -// CHECK: ret <3 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 +// CHECK: ret <3 x float> [[ROUNDEVEN]] float3 test_round_uint3(uint3 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 -// CHECK: ret <4 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 +// CHECK: ret <4 x float> [[ROUNDEVEN]] float4 test_round_uint4(uint4 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int64_t -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( -// CHECK: ret float %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( +// CHECK: ret float [[ROUNDEVEN]] float test_round_int64_t(int64_t p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 -// CHECK: ret <2 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> [[ROUNDEVEN]] float2 test_round_int64_t2(int64_t2 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 -// CHECK: ret <3 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 +// CHECK: ret <3 x float> [[ROUNDEVEN]] float3 test_round_int64_t3(int64_t3 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 -// CHECK: ret <4 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 +// CHECK: ret <4 x float> [[ROUNDEVEN]] float4 test_round_int64_t4(int64_t4 p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( -// CHECK: ret float %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( +// CHECK: ret float [[ROUNDEVEN]] float test_round_uint64_t(uint64_t p0) { return round(p0); } // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2 -// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 -// CHECK: ret <2 x float> %elt.roundeven +// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> [[ROUNDEVEN]] 
float2 test_round_uint64_t2(uint64_t2 p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
-// CHECK: ret <3 x float> %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
+// CHECK: ret <3 x float> [[ROUNDEVEN]]
 float3 test_round_uint64_t3(uint64_t3 p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
-// CHECK: ret <4 x float> %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
+// CHECK: ret <4 x float> [[ROUNDEVEN]]
 float4 test_round_uint64_t4(uint64_t4 p0) { return round(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
index 0d4afee6ba9a8..8161b0c1c3256 100644
--- a/clang/test/CodeGenHLSL/builtins/round.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
@@ -6,47 +6,47 @@
 // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_round_half
-// NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn half @llvm.roundeven.f16(
-// NATIVE_HALF: ret half %elt.roundeven
+// NATIVE_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.roundeven.f16(
+// NATIVE_HALF: ret half [[ROUNDEVEN]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_round_half
-// NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
-// NO_HALF: ret float %elt.roundeven
+// NO_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
+// NO_HALF: ret float [[ROUNDEVEN]]
 half test_round_half(half p0) { return round(p0); }
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2
-// NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.roundeven.v2f16
-// NATIVE_HALF: ret <2 x half> %elt.roundeven
+// NATIVE_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.roundeven.v2f16
+// NATIVE_HALF: ret <2 x half> [[ROUNDEVEN]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2
-// NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32(
-// NO_HALF: ret <2 x float> %elt.roundeven
+// NO_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32(
+// NO_HALF: ret <2 x float> [[ROUNDEVEN]]
 half2 test_round_half2(half2 p0) { return round(p0); }
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3
-// NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.roundeven.v3f16
-// NATIVE_HALF: ret <3 x half> %elt.roundeven
+// NATIVE_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.roundeven.v3f16
+// NATIVE_HALF: ret <3 x half> [[ROUNDEVEN]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3
-// NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32(
-// NO_HALF: ret <3 x float> %elt.roundeven
+// NO_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32(
+// NO_HALF: ret <3 x float> [[ROUNDEVEN]]
 half3 test_round_half3(half3 p0) { return round(p0); }
 
 // NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4
-// NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.roundeven.v4f16
-// NATIVE_HALF: ret <4 x half> %elt.roundeven
+// NATIVE_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.roundeven.v4f16
+// NATIVE_HALF: ret <4 x half> [[ROUNDEVEN]]
 // NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4
-// NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32(
-// NO_HALF: ret <4 x float> %elt.roundeven
+// NO_HALF: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32(
+// NO_HALF: ret <4 x float> [[ROUNDEVEN]]
 half4 test_round_half4(half4 p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_round_float
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
-// CHECK: ret float %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
+// CHECK: ret float [[ROUNDEVEN]]
 float test_round_float(float p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
-// CHECK: ret <2 x float> %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
+// CHECK: ret <2 x float> [[ROUNDEVEN]]
 float2 test_round_float2(float2 p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
-// CHECK: ret <3 x float> %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
+// CHECK: ret <3 x float> [[ROUNDEVEN]]
 float3 test_round_float3(float3 p0) { return round(p0); }
 
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4
-// CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
-// CHECK: ret <4 x float> %elt.roundeven
+// CHECK: [[ROUNDEVEN:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
+// CHECK: ret <4 x float> [[ROUNDEVEN]]
 float4 test_round_float4(float4 p0) { return round(p0); }

From 47c1aa4cef638c97b74f3afb7bed60e92bba1f90 Mon Sep 17 00:00:00 2001
From: Ahmed Nour
Date: Mon, 17 Nov 2025 17:34:34 +0200
Subject: [PATCH 024/105] [X86] Add constexpr support for addsub intrinsics (#167512)

Recent commits (7fe069121b57a, 53ddeb493529a) marked several x86
intrinsics as constexpr in headers without providing the necessary
constant evaluation support in the compiler backend. This caused
compilation failures when attempting to use these intrinsics in
constant expressions.
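
For illustration, a usage sketch of the kind this change enables. This is a hypothetical snippet, not a test from this patch (the values mirror the TEST_CONSTEXPR cases added below), and it assumes a clang C++ translation unit compiled with SSE3 enabled:

```c++
#include <immintrin.h>

// Sketch: addsub lane rule is Result[i] = (i % 2 == 0) ? a[i] - b[i]
//                                                      : a[i] + b[i].
// With the new evaluator support this now folds at compile time.
constexpr __m128d V = _mm_addsub_pd((__m128d){2.0, 2.0}, (__m128d){1.0, 2.0});
static_assert(V[0] == 1.0, "even lane subtracts: 2.0 - 1.0");
static_assert(V[1] == 4.0, "odd lane adds: 2.0 + 2.0");
```

Previously this failed to fold because neither constant evaluator had a handler for the addsub builtins.
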
Resolves #166814 Resolves #161203 --- clang/include/clang/Basic/BuiltinsX86.td | 18 ++++++++++--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 34 ++++++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 29 ++++++++++++++++++++ clang/lib/Headers/avxintrin.h | 10 +++---- clang/lib/Headers/pmmintrin.h | 7 +++-- clang/test/CodeGen/X86/avx-builtins.c | 2 ++ clang/test/CodeGen/X86/sse3-builtins.c | 2 ++ 7 files changed, 88 insertions(+), 14 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index bbe0aa3657c06..a656fe341c8e0 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -92,8 +92,8 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { def cmpsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant char)">; } - - let Features = "sse3" in { + let Features = "sse3", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { foreach Op = ["addsub"] in { def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">; def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">; @@ -121,8 +121,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } // AVX -let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in { - foreach Op = ["addsub", "max", "min"] in { +let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], + Features = "avx" in { + foreach Op = ["max", "min"] in { def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">; } @@ -571,6 +572,15 @@ let Features = "avx", def movmskps256 : X86Builtin<"int(_Vector<8, float>)">; } +let Features = "avx", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def addsubpd256 + : X86Builtin< + "_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; + def addsubps256 + : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">; +} + let Features = "avx", Attributes = [NoThrow] in { def vzeroall : X86Builtin<"void()">; def vzeroupper : X86Builtin<"void()">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index a2f99c7c234fe..30426565407ba 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2715,6 +2715,35 @@ static bool interp_builtin_horizontal_fp_binop( return true; } +static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + // Addsub: alternates between subtraction and addition + // Result[i] = (i % 2 == 0) ? 
(a[i] - b[i]) : (a[i] + b[i])
+  const Pointer &RHS = S.Stk.pop<Pointer>();
+  const Pointer &LHS = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+  FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
+  llvm::RoundingMode RM = getRoundingMode(FPO);
+  const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+  unsigned NumElems = VT->getNumElements();
+
+  using T = PrimConv<PT_Float>::T;
+  for (unsigned I = 0; I != NumElems; ++I) {
+    APFloat LElem = LHS.elem<T>(I).getAPFloat();
+    APFloat RElem = RHS.elem<T>(I).getAPFloat();
+    if (I % 2 == 0) {
+      // Even indices: subtract
+      LElem.subtract(RElem, RM);
+    } else {
+      // Odd indices: add
+      LElem.add(RElem, RM);
+    }
+    Dst.elem<T>(I) = static_cast<T>(LElem);
+  }
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp__builtin_elementwise_triop_fp(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APFloat(const APFloat &, const APFloat &,
                                const APFloat &, llvm::RoundingMode)>
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
+  case X86::BI__builtin_ia32_addsubpd:
+  case X86::BI__builtin_ia32_addsubps:
+  case X86::BI__builtin_ia32_addsubpd256:
+  case X86::BI__builtin_ia32_addsubps256: {
+    APValue SourceLHS, SourceRHS;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+      return false;
+    unsigned NumElems = SourceLHS.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(NumElems);
+    llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
+
+    for (unsigned I = 0; I != NumElems; ++I) {
+      APFloat LHS = SourceLHS.getVectorElt(I).getFloat();
+      APFloat RHS = SourceRHS.getVectorElt(I).getFloat();
+      if (I % 2 == 0) {
+        // Even indices: subtract
+        LHS.subtract(RHS, RM);
+      } else {
+        // Odd indices: add
+        LHS.add(RHS, RM);
+      }
+      ResultElements.push_back(APValue(LHS));
+    }
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   case Builtin::BI__builtin_elementwise_fshl:
   case Builtin::BI__builtin_elementwise_fshr: {
     APValue SourceHi, SourceLo, SourceShift;
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 4aef9245323fb..3e1618ed192c8 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -147,9 +147,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_sub_ps(__m256 __a,
 /// A 256-bit vector of [4 x double] containing the right source operand.
 /// \returns A 256-bit vector of [4 x double] containing the alternating sums
 /// and differences between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_addsub_pd(__m256d __a, __m256d __b)
-{
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_addsub_pd(__m256d __a, __m256d __b) {
   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 }
 
@@ -166,9 +165,8 @@ _mm256_addsub_pd(__m256d __a, __m256d __b)
 /// A 256-bit vector of [8 x float] containing the right source operand.
 /// \returns A 256-bit vector of [8 x float] containing the alternating sums and
 /// differences between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_addsub_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_addsub_ps(__m256 __a, __m256 __b) {
   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 6b152bde29fc1..a9a65440363c3 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -60,9 +60,8 @@ _mm_lddqu_si128(__m128i_u const *__p)
 /// A 128-bit vector of [4 x float] containing the right source operand.
 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
 /// differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_addsub_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_addsub_ps(__m128 __a, __m128 __b) { return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); } @@ -166,7 +165,7 @@ _mm_moveldup_ps(__m128 __a) /// A 128-bit vector of [2 x double] containing the right source operand. /// \returns A 128-bit vector of [2 x double] containing the alternating sums /// and differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_addsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); } diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 737febbc7fef6..46bc28b85d8db 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -45,12 +45,14 @@ __m256d test_mm256_addsub_pd(__m256d A, __m256d B) { // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_addsub_pd(A, B); } +TEST_CONSTEXPR(match_m256d(_mm256_addsub_pd((__m256d){+1.0, +2.0, +3.0, +4.0}, (__m256d){+1.0, +1.0, +1.0, +1.0}), +0.0, +3.0, +2.0, +5.0)); __m256 test_mm256_addsub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_addsub_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_addsub_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_addsub_ps((__m256){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}, (__m256){+1.0f, +1.0f, +1.0f, +1.0f, +1.0f, +1.0f, +1.0f, +1.0f}), +0.0f, +3.0f, +2.0f, +5.0f, +4.0f, +7.0f, +6.0f, +9.0f)); __m256d test_mm256_and_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_and_pd diff --git a/clang/test/CodeGen/X86/sse3-builtins.c b/clang/test/CodeGen/X86/sse3-builtins.c index a82dd4080670b..44389fbdc6f77 100644 --- a/clang/test/CodeGen/X86/sse3-builtins.c +++ b/clang/test/CodeGen/X86/sse3-builtins.c @@ -19,12 +19,14 @@ __m128d test_mm_addsub_pd(__m128d A, __m128d B) { // CHECK: call {{.*}}<2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_addsub_pd(A, B); } +TEST_CONSTEXPR(match_m128d(_mm_addsub_pd((__m128d){+2.0, +2.0}, (__m128d){+1.0, +2.0}), +1.0, +4.0)); __m128 test_mm_addsub_ps(__m128 A, __m128 B) { // CHECK-LABEL: test_mm_addsub_ps // CHECK: call {{.*}}<4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_addsub_ps(A, B); } +TEST_CONSTEXPR(match_m128(_mm_addsub_ps((__m128){+3.0f, +4.0f, +5.0f, +6.0f}, (__m128){+1.0f, +2.0f, +3.0f, +4.0f}), +2.0f, +6.0f, +2.0f, +10.0f)); __m128d test_mm_hadd_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_hadd_pd From 17cbb48c49a8d4408f7afa088f9c8a30be567a75 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 10:43:31 -0700 Subject: [PATCH 025/105] [MLIR] Apply clang-tidy fixes for readability-identifier-naming in Parser.cpp (NFC) --- mlir/lib/Query/Matcher/Parser.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Query/Matcher/Parser.cpp b/mlir/lib/Query/Matcher/Parser.cpp index e392a885c511b..7bfe03d02a7fa 100644 --- a/mlir/lib/Query/Matcher/Parser.cpp +++ b/mlir/lib/Query/Matcher/Parser.cpp @@ -27,7 +27,7 @@ struct Parser::TokenInfo { } // Known identifiers. 
- static const char *const ID_Extract; + static const char *const idExtract; llvm::StringRef text; TokenKind kind = TokenKind::Eof; @@ -35,7 +35,7 @@ struct Parser::TokenInfo { VariantValue value; }; -const char *const Parser::TokenInfo::ID_Extract = "extract"; +const char *const Parser::TokenInfo::idExtract = "extract"; class Parser::CodeTokenizer { public: @@ -452,13 +452,13 @@ bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken, } if (chainCallToken.kind != TokenKind::Ident || - chainCallToken.text != TokenInfo::ID_Extract) { + chainCallToken.text != TokenInfo::idExtract) { error->addError(chainCallToken.range, ErrorType::ParserMalformedChainedExpr); return false; } - if (chainCallToken.text == TokenInfo::ID_Extract && + if (chainCallToken.text == TokenInfo::idExtract && !parseChainedExpression(functionName)) return false; } From 38811bea5a567b8b848735af7ed6bacd52d3a3dc Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Mon, 17 Nov 2025 21:24:54 +0530 Subject: [PATCH 026/105] [Flang] [OpenMP] Add support for spaces in between the name (#168311) Supports the fixed form syntax which has spaces in between the identifier --- flang/lib/Parser/prescan.cpp | 3 +++ flang/lib/Parser/prescan.h | 3 +++ flang/test/Parser/OpenMP/name-with-space.f | 15 +++++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 flang/test/Parser/OpenMP/name-with-space.f diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 8cccd84f9fa19..5e8d50be277a0 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -845,6 +845,9 @@ bool Prescanner::NextToken(TokenSequence &tokens) { if (InFixedFormSource()) { SkipSpaces(); } + if (inFixedForm_ && (IsOpenMPDirective() && parenthesisNesting_ > 0)) { + SkipSpaces(); + } if ((*at_ == '\'' || *at_ == '"') && tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..." QuotedCharacterLiteral(tokens, start); diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index 5e7481781d944..4f691d56975a9 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -183,6 +183,9 @@ class Prescanner { bool InConditionalLine() const { return InOpenMPConditionalLine() || InOpenACCOrCUDAConditionalLine(); } + bool IsOpenMPDirective() const { + return directiveSentinel_ && std::strcmp(directiveSentinel_, "$omp") == 0; + } bool InFixedFormSource() const { return inFixedForm_ && !inPreprocessorDirective_ && !InCompilerDirective(); } diff --git a/flang/test/Parser/OpenMP/name-with-space.f b/flang/test/Parser/OpenMP/name-with-space.f new file mode 100644 index 0000000000000..603ebc40c9f4c --- /dev/null +++ b/flang/test/Parser/OpenMP/name-with-space.f @@ -0,0 +1,15 @@ +! 
RUN: %flang_fc1 -fopenmp -fdebug-unparse-no-sema %s 2>&1 | FileCheck %s + + program name_with_space +!CHECK: !$OMP THREADPRIVATE(/cc/, var1) +!$omp threadprivate(/c c/, var 1) + +!CHECK: !$OMP PARALLEL PRIVATE(somevar,expr1,expr2) IF(expr2>expr1) +!$omp parallel private(some var, expr 1, ex pr2) +!$omp+ if (exp r2 > ex pr1) +!$omp critical (x_x) + print '(a)', 'Hello World' +!CHECK: !$OMP END CRITICAL(x_x) +!$omp end critical (x _x) +!$omp end parallel + end program name_with_space From d65be16ab6adf00af21e75d29049ae5de0f3a38a Mon Sep 17 00:00:00 2001 From: Ryan Cowan Date: Mon, 17 Nov 2025 15:55:40 +0000 Subject: [PATCH 027/105] [AArch64][GlobalISel] Add combine for build_vector(unmerge, unmerge, undef, undef) (#165539) This PR adds a new combine to the `post-legalizer-combiner` pass. The new combine checks for vectors being unmerged and subsequently padded with `G_IMPLICIT_DEF` values by building a new vector. If such a case is found, the vector being unmerged is instead just concatenated with a `G_IMPLICIT_DEF` that is as wide as the vector being unmerged. This removes unnecessary `mov` instructions in a few places. --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 ++ .../include/llvm/Target/GlobalISel/Combine.td | 12 ++- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 84 ++++++++++++++++++- llvm/test/CodeGen/AArch64/fptrunc.ll | 18 ++-- llvm/test/CodeGen/AArch64/itofp.ll | 48 +++-------- .../build-vector-packed-partial-undef.ll | 54 ++++-------- 6 files changed, 131 insertions(+), 91 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 96cb7cdf2d531..9de1a643f1000 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -647,6 +647,12 @@ class CombinerHelper { bool matchRotateOutOfRange(MachineInstr &MI) const; void applyRotateOutOfRange(MachineInstr &MI) const; + bool matchCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + Register &UnmergeSrc) const; + void applyCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + Register &UnmergeSrc) const; + bool matchUseVectorTruncate(MachineInstr &MI, Register &MatchInfo) const; void applyUseVectorTruncate(MachineInstr &MI, Register &MatchInfo) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 119695e53c3cb..0ab2d9487a295 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -921,6 +921,15 @@ def merge_of_x_and_zero : GICombineRule < [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>; +// Transform build_vector(unmerge(src, 0), ... 
unmerge(src, n), undef, ..., undef) +// => concat_vectors(src, undef) +def combine_build_unmerge : GICombineRule< + (defs root:$root, register_matchinfo:$unmergeSrc), + (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, + [{ return Helper.matchCombineBuildUnmerge(*${root}, MRI, ${unmergeSrc}); }]), + (apply [{ Helper.applyCombineBuildUnmerge(*${root}, MRI, B, ${unmergeSrc}); }]) +>; + def merge_combines: GICombineGroup<[ unmerge_anyext_build_vector, unmerge_merge, @@ -930,7 +939,8 @@ def merge_combines: GICombineGroup<[ unmerge_dead_to_trunc, unmerge_zext_to_zext, merge_of_x_and_undef, - merge_of_x_and_zero + merge_of_x_and_zero, + combine_build_unmerge ]>; // Under certain conditions, transform: diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ec4d13f1cd1b3..45a08347b1ec2 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -3463,6 +3463,88 @@ static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits, isConstTrueVal(TLI, Cst, IsVector, IsFP); } +// This pattern aims to match the following shape to avoid extra mov +// instructions +// G_BUILD_VECTOR( +// G_UNMERGE_VALUES(src, 0) +// G_UNMERGE_VALUES(src, 1) +// G_IMPLICIT_DEF +// G_IMPLICIT_DEF +// ) +// -> +// G_CONCAT_VECTORS( +// src, +// undef +// ) +bool CombinerHelper::matchCombineBuildUnmerge(MachineInstr &MI, + MachineRegisterInfo &MRI, + Register &UnmergeSrc) const { + auto &BV = cast(MI); + + unsigned BuildUseCount = BV.getNumSources(); + if (BuildUseCount % 2 != 0) + return false; + + unsigned NumUnmerge = BuildUseCount / 2; + + auto *Unmerge = getOpcodeDef(BV.getSourceReg(0), MRI); + + // Check the first operand is an unmerge and has the correct number of + // operands + if (!Unmerge || Unmerge->getNumDefs() != NumUnmerge) + return false; + + UnmergeSrc = Unmerge->getSourceReg(); + + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT UnmergeSrcTy = MRI.getType(UnmergeSrc); + + // Ensure we only generate legal instructions post-legalizer + if (!IsPreLegalize && + !isLegal({TargetOpcode::G_CONCAT_VECTORS, {DstTy, UnmergeSrcTy}})) + return false; + + // Check that all of the operands before the midpoint come from the same + // unmerge and are in the same order as they are used in the build_vector + for (unsigned I = 0; I < NumUnmerge; ++I) { + auto MaybeUnmergeReg = BV.getSourceReg(I); + auto *LoopUnmerge = getOpcodeDef(MaybeUnmergeReg, MRI); + + if (!LoopUnmerge || LoopUnmerge != Unmerge) + return false; + + if (LoopUnmerge->getOperand(I).getReg() != MaybeUnmergeReg) + return false; + } + + // Check that all of the unmerged values are used + if (Unmerge->getNumDefs() != NumUnmerge) + return false; + + // Check that all of the operands after the mid point are undefs. 
+ for (unsigned I = NumUnmerge; I < BuildUseCount; ++I) { + auto *Undef = getDefIgnoringCopies(BV.getSourceReg(I), MRI); + + if (Undef->getOpcode() != TargetOpcode::G_IMPLICIT_DEF) + return false; + } + + return true; +} + +void CombinerHelper::applyCombineBuildUnmerge(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + Register &UnmergeSrc) const { + assert(UnmergeSrc && "Expected there to be one matching G_UNMERGE_VALUES"); + B.setInstrAndDebugLoc(MI); + + Register UndefVec = B.buildUndef(MRI.getType(UnmergeSrc)).getReg(0); + B.buildConcatVectors(MI.getOperand(0), {UnmergeSrc, UndefVec}); + + MI.eraseFromParent(); +} + // This combine tries to reduce the number of scalarised G_TRUNC instructions by // using vector truncates instead // @@ -8426,4 +8508,4 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI, } return false; -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index da19991d56259..ae86129286ddc 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -345,19 +345,11 @@ entry: } define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { -; CHECK-SD-LABEL: fptrunc_v2f32_v2f16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptrunc_v2f32_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %c diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index fce4f8e69f14d..e526a9f7bc0f6 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -5763,18 +5763,14 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i64> %a to <2 x half> @@ -5808,18 +5804,14 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; 
CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i64> %a to <2 x half> @@ -6232,17 +6224,13 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x half> @@ -6267,17 +6255,13 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x half> @@ -6480,9 +6464,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x half> @@ -6509,9 +6491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x00ffff0000ffff ; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x half> @@ -6766,9 +6746,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i8_v2f16: @@ -6817,9 +6795,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; 
CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i8_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index c1b8bc6031b18..f7dbcd137e742 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -762,25 +762,13 @@ define void @undef_hi3_v4f16(half %arg0) { } define void @undef_hi2_v4i16(<2 x i16> %arg0) { -; GFX8-SDAG-LABEL: undef_hi2_v4i16: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: ;;#ASMSTART -; GFX8-SDAG-NEXT: ; use v[0:1] -; GFX8-SDAG-NEXT: ;;#ASMEND -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: undef_hi2_v4i16: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-GISEL-NEXT: ;;#ASMSTART -; GFX8-GISEL-NEXT: ; use v[0:1] -; GFX8-GISEL-NEXT: ;;#ASMEND -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: undef_hi2_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; use v[0:1] +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: undef_hi2_v4i16: ; GFX9: ; %bb.0: @@ -803,25 +791,13 @@ define void @undef_hi2_v4i16(<2 x i16> %arg0) { } define void @undef_hi2_v4f16(<2 x half> %arg0) { -; GFX8-SDAG-LABEL: undef_hi2_v4f16: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: ;;#ASMSTART -; GFX8-SDAG-NEXT: ; use v[0:1] -; GFX8-SDAG-NEXT: ;;#ASMEND -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: undef_hi2_v4f16: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-GISEL-NEXT: ;;#ASMSTART -; GFX8-GISEL-NEXT: ; use v[0:1] -; GFX8-GISEL-NEXT: ;;#ASMEND -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: undef_hi2_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; use v[0:1] +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: undef_hi2_v4f16: ; GFX9: ; %bb.0: @@ -842,5 +818,3 @@ define void @undef_hi2_v4f16(<2 x half> %arg0) { call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi); ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8: {{.*}} From ff7896e0fa2e24fb4e07c33ce1c96f87b0b0344a Mon Sep 17 00:00:00 2001 From: Tim Noack Date: Mon, 17 Nov 2025 16:57:24 +0100 Subject: [PATCH 028/105] [MLIR] Add verification that symbol operations must not have results (#168390) This patch adds verification to the `SymbolOpInterface` to enforce the design constraint that symbol operations must not produce SSA results, as documented in [Symbols and SymbolTables](https://mlir.llvm.org/docs/SymbolsAndSymbolTables/#defining-or-declaring-a-symbol). 
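For example, an op such as `%0 = "test.symbol_with_result"() <{sym_name = "test_symbol"}> : () -> i32` (the case exercised by the new `invalid-ops.mlir` test below) is now rejected with the diagnostic `'test.symbol_with_result' op symbols must not have results`.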
This is a follow-up of #168376 --- mlir/include/mlir/IR/SymbolInterfaces.td | 2 ++ mlir/test/IR/invalid-ops.mlir | 8 ++++++++ mlir/test/lib/Dialect/Test/TestOps.td | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/mlir/include/mlir/IR/SymbolInterfaces.td b/mlir/include/mlir/IR/SymbolInterfaces.td index bbfa30815bd4a..b3aafe063d376 100644 --- a/mlir/include/mlir/IR/SymbolInterfaces.td +++ b/mlir/include/mlir/IR/SymbolInterfaces.td @@ -171,6 +171,8 @@ def Symbol : OpInterface<"SymbolOpInterface"> { if (concreteOp.isDeclaration() && concreteOp.isPublic()) return concreteOp.emitOpError("symbol declaration cannot have public " "visibility"); + if ($_op->getNumResults() != 0) + return concreteOp.emitOpError("symbols must not have results"); auto parent = $_op->getParentOp(); if (parent && !parent->hasTrait() && parent->isRegistered()) { return concreteOp.emitOpError("symbol's parent must have the SymbolTable " diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index 0c5fec8c4055a..2f5dd28b51911 100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -145,3 +145,11 @@ func.func @verify_fail_3() { %r = "arith.constant"() {value = -3 : si32} : () -> si32 return } + +// ----- + +// Verify that symbols with results are rejected +module { + // expected-error@+1 {{'test.symbol_with_result' op symbols must not have results}} + %0 = "test.symbol_with_result"() <{sym_name = "test_symbol"}> : () -> i32 +} diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 275025978a784..670223984fd95 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -120,6 +120,13 @@ def SymbolOp : TEST_Op<"symbol", [NoMemoryEffect, Symbol]> { OptionalAttr:$sym_visibility); } +def SymbolWithResultOp : TEST_Op<"symbol_with_result", [Symbol]> { + let summary = "invalid symbol operation that produces an SSA result"; + let arguments = (ins StrAttr:$sym_name, + OptionalAttr:$sym_visibility); + let results = (outs AnyType:$result); +} + def OverriddenSymbolVisibilityOp : TEST_Op<"overridden_symbol_visibility", [ DeclareOpInterfaceMethods, ]> { From 498a01db9b1a5424e28665aa0c02eacad5ab027f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 17 Nov 2025 07:59:37 -0800 Subject: [PATCH 029/105] [Option] Use llvm::is_contained (NFC) (#168295) Identified with llvm-use-ranges. --- llvm/lib/Option/OptTable.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 20398b5f582f4..065036cedc2ae 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -796,8 +796,7 @@ void OptTable::internalPrintHelp( unsigned ActiveSubCommandID = ActiveSubCommand - &SubCommands[0]; // Print if the ActiveSubCommandID is registered with the CandidateInfo // Option. - return std::find(SubCommandIDs.begin(), SubCommandIDs.end(), - ActiveSubCommandID) != SubCommandIDs.end(); + return llvm::is_contained(SubCommandIDs, ActiveSubCommandID); }; for (unsigned Id = 1, e = getNumOptions() + 1; Id != e; ++Id) { From 99bf41cd11daa3ee32431c12ff5084fc90f1f91d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 17 Nov 2025 07:59:45 -0800 Subject: [PATCH 030/105] [TargetParser] Use range-based for loops (#168296) While I am at it, this patch converts one of the loops to use llvm::is_contained. Identified with modernize-loop-convert. 
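
To illustrate the cleanup, here is a minimal standalone sketch of the pattern (not the OptTable code itself):

```cpp
#include <algorithm>

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Before: spell out the begin()/end() iterator pair by hand.
static bool containsManual(const llvm::SmallVector<unsigned, 4> &IDs,
                           unsigned ID) {
  return std::find(IDs.begin(), IDs.end(), ID) != IDs.end();
}

// After: llvm::is_contained expresses the same membership test
// directly on the range.
static bool containsRange(const llvm::SmallVector<unsigned, 4> &IDs,
                          unsigned ID) {
  return llvm::is_contained(IDs, ID);
}
```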
--- llvm/lib/TargetParser/Host.cpp | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index c164762de2966..3f9f69549f2db 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -512,11 +512,11 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { // Look for the CPU features. SmallVector CPUFeatures; - for (unsigned I = 0, E = Lines.size(); I != E; ++I) - if (Lines[I].starts_with("features")) { - size_t Pos = Lines[I].find(':'); + for (StringRef Line : Lines) + if (Line.starts_with("features")) { + size_t Pos = Line.find(':'); if (Pos != StringRef::npos) { - Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' '); + Line.drop_front(Pos + 1).split(CPUFeatures, ' '); break; } } @@ -524,20 +524,16 @@ StringRef sys::detail::getHostCPUNameForS390x(StringRef ProcCpuinfoContent) { // We need to check for the presence of vector support independently of // the machine type, since we may only use the vector register set when // supported by the kernel (and hypervisor). - bool HaveVectorSupport = false; - for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) { - if (CPUFeatures[I] == "vx") - HaveVectorSupport = true; - } + bool HaveVectorSupport = llvm::is_contained(CPUFeatures, "vx"); // Now check the processor machine type. - for (unsigned I = 0, E = Lines.size(); I != E; ++I) { - if (Lines[I].starts_with("processor ")) { - size_t Pos = Lines[I].find("machine = "); + for (StringRef Line : Lines) { + if (Line.starts_with("processor ")) { + size_t Pos = Line.find("machine = "); if (Pos != StringRef::npos) { Pos += sizeof("machine = ") - 1; unsigned int Id; - if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) + if (!Line.drop_front(Pos).getAsInteger(10, Id)) return getCPUNameFromS390Model(Id, HaveVectorSupport); } break; @@ -554,9 +550,9 @@ StringRef sys::detail::getHostCPUNameForRISCV(StringRef ProcCpuinfoContent) { // Look for uarch line to determine cpu name StringRef UArch; - for (unsigned I = 0, E = Lines.size(); I != E; ++I) { - if (Lines[I].starts_with("uarch")) { - UArch = Lines[I].substr(5).ltrim("\t :"); + for (StringRef Line : Lines) { + if (Line.starts_with("uarch")) { + UArch = Line.substr(5).ltrim("\t :"); break; } } From bf21156ee69a7c8b27733d037abc0ab108451de9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 17 Nov 2025 07:59:53 -0800 Subject: [PATCH 031/105] [IPO] Remove a redundant cast (NFC) (#168297) Idx is already of type unsigned. Identified with readability-redundant-casting. 
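
For context, `readability-redundant-casting` flags casts whose source and destination types already match. A minimal sketch of the pattern (simplified for illustration, not the IROutliner code verbatim):

```cpp
unsigned Idx = 0;
// Redundant: Idx is already unsigned, so the cast is a no-op.
unsigned A = static_cast<unsigned>(Idx);
// Equivalent and clearer without the cast.
unsigned B = Idx;
```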
--- llvm/lib/Transforms/IPO/IROutliner.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index e6ddc8029ce7b..6e1ca9c4cd2d6 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2108,8 +2108,7 @@ static void createAndInsertBasicBlocks(DenseMap &OldMap, for (Value *RetVal : SortedKeys) { BasicBlock *NewBB = BasicBlock::Create( - ParentFunc->getContext(), - Twine(BaseName) + Twine("_") + Twine(static_cast(Idx++)), + ParentFunc->getContext(), Twine(BaseName) + Twine("_") + Twine(Idx++), ParentFunc); NewMap.insert(std::make_pair(RetVal, NewBB)); } From dcf8cd9c5b7cf15a07850484c6bb50516c4faacd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 17 Nov 2025 08:00:01 -0800 Subject: [PATCH 032/105] [ADT] Consolidate the grow() logic in DenseMapBase (NFC) (#168316) This patch consolidates the grow() logic in DenseMapBase::grow. With this patch, DenseMapBase::grow() creates a temporary grown instance and then lets DenseMap/SmallDenseMap attempt to move the instance back to *this. If it doesn't work, we move again. The "attempt to move" always succeeds for DenseMap. For SmallDenseMap, it succeeds only in the large mode. This is part of the effort outlined in #168255. --- llvm/include/llvm/ADT/DenseMap.h | 39 +++++++++++++++----------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index aa5d8a8729647..887f1af2417c2 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -558,7 +558,12 @@ class DenseMapBase : public DebugEpochBase { void grow(unsigned MinNumBuckets) { unsigned NumBuckets = DerivedT::roundUpNumBuckets(MinNumBuckets); - derived().grow(NumBuckets); + DerivedT Tmp(NumBuckets, ExactBucketCount{}); + Tmp.moveFrom(derived()); + if (derived().maybeMoveFast(std::move(Tmp))) + return; + initWithExactBucketCount(NumBuckets); + moveFrom(Tmp); } template @@ -842,10 +847,9 @@ class DenseMap : public DenseMapBase, static_cast(NextPowerOf2(MinNumBuckets - 1))); } - void grow(unsigned AtLeast) { - DenseMap Tmp(AtLeast, typename BaseT::ExactBucketCount{}); - Tmp.moveFrom(*this); - swapImpl(Tmp); + bool maybeMoveFast(DenseMap &&Other) { + swapImpl(Other); + return true; } // Plan how to shrink the bucket table. Return: @@ -1110,23 +1114,16 @@ class SmallDenseMap static_cast(NextPowerOf2(MinNumBuckets - 1))); } - void grow(unsigned NumBuckets) { - SmallDenseMap Tmp(NumBuckets, typename BaseT::ExactBucketCount{}); - Tmp.moveFrom(*this); + bool maybeMoveFast(SmallDenseMap &&Other) { + if (Other.Small) + return false; - if (Tmp.Small) { - // Use moveFrom in those rare cases where we stay in the small mode. This - // can happen when we have many tombstones. - Small = true; - this->BaseT::initEmpty(); - this->moveFrom(Tmp); - } else { - Small = false; - NumEntries = Tmp.NumEntries; - NumTombstones = Tmp.NumTombstones; - *getLargeRep() = std::move(*Tmp.getLargeRep()); - Tmp.getLargeRep()->NumBuckets = 0; - } + Small = false; + NumEntries = Other.NumEntries; + NumTombstones = Other.NumTombstones; + *getLargeRep() = std::move(*Other.getLargeRep()); + Other.getLargeRep()->NumBuckets = 0; + return true; } // Plan how to shrink the bucket table. 
Return:

From 3c54972def503440e351a0ec6553c8fed884fe13 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 17 Nov 2025 08:00:09 -0800
Subject: [PATCH 033/105] [ADT] Remove DenseMap::init (NFC) (#168322)

This patch removes DenseMap::init and SmallDenseMap::init by inlining
them into their call sites and simplifying them. init() is defined as:

  void init(unsigned InitNumEntries) {
    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
    this->initWithExactBucketCount(InitBuckets);
  }

- Constructors: Now that we have constructors that allocate the exact
  number of buckets (as opposed to the number of key/value pairs),
  init() does too much. Once we convert the number of key/value pairs
  to the number of buckets, we can call the constructors that take the
  exact number of buckets.

- init(0) in the move assignment operators simplifies down to:
  initWithExactBucketCount(0)

- shrink_and_clear() computes the number of buckets to have after the
  clear operation. As such, we should call initWithExactBucketCount,
  not init. Otherwise, we would end up adding "load factor padding" on
  top of NewNumBuckets: NextPowerOf2(NewNumBuckets * 4 / 3 + 1)

All in all, init() doesn't bring any value in the current setup.

This patch is part of the effort outlined in #168255.
---
 llvm/include/llvm/ADT/DenseMap.h | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 887f1af2417c2..a706f68fab81b 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -162,7 +162,7 @@ class DenseMapBase : public DebugEpochBase {
       return;
     }
     derived().deallocateBuckets();
-    derived().init(NewNumBuckets);
+    initWithExactBucketCount(NewNumBuckets);
   }

   /// Return true if the specified key is in the map, false otherwise.
@@ -755,9 +755,9 @@ class DenseMap : public DenseMapBase,
 public:
   /// Create a DenseMap with an optional \p NumElementsToReserve to guarantee
   /// that this number of elements can be inserted in the map without grow().
-  explicit DenseMap(unsigned NumElementsToReserve = 0) {
-    init(NumElementsToReserve);
-  }
+  explicit DenseMap(unsigned NumElementsToReserve = 0)
+      : DenseMap(BaseT::getMinBucketToReserveForEntries(NumElementsToReserve),
+                 typename BaseT::ExactBucketCount{}) {}

   DenseMap(const DenseMap &other) : DenseMap() { this->copyFrom(other); }

@@ -789,7 +789,7 @@ class DenseMap : public DenseMapBase,
   DenseMap &operator=(DenseMap &&other) {
     this->destroyAll();
     deallocateBuckets();
-    init(0);
+    this->initWithExactBucketCount(0);
     this->swap(other);
     return *this;
   }
@@ -830,11 +830,6 @@ class DenseMap : public DenseMapBase,
     return true;
   }

-  void init(unsigned InitNumEntries) {
-    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
-    this->initWithExactBucketCount(InitBuckets);
-  }
-
   // Put the zombie instance in a known good state after a move.
  void kill() {
    deallocateBuckets();
@@ -902,9 +897,10 @@ class SmallDenseMap
  }

 public:
-  explicit SmallDenseMap(unsigned NumElementsToReserve = 0) {
-    init(NumElementsToReserve);
-  }
+  explicit SmallDenseMap(unsigned NumElementsToReserve = 0)
+      : SmallDenseMap(
+            BaseT::getMinBucketToReserveForEntries(NumElementsToReserve),
+            typename BaseT::ExactBucketCount{}) {}

   SmallDenseMap(const SmallDenseMap &other) : SmallDenseMap() {
     this->copyFrom(other);
@@ -939,7 +935,7 @@ class SmallDenseMap
   SmallDenseMap &operator=(SmallDenseMap &&other) {
     this->destroyAll();
     deallocateBuckets();
-    init(0);
+    this->initWithExactBucketCount(0);
     this->swap(other);
     return *this;
   }
@@ -1095,11 +1091,6 @@ class SmallDenseMap
     return true;
   }

-  void init(unsigned InitNumEntries) {
-    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
-    this->initWithExactBucketCount(InitBuckets);
-  }
-
   // Put the zombie instance in a known good state after a move.
   void kill() {
     deallocateBuckets();

From e69d2bf6031a534c824d1ce4191f5fc334ab4ae6 Mon Sep 17 00:00:00 2001
From: Michael Kruse
Date: Mon, 17 Nov 2025 17:05:18 +0100
Subject: [PATCH 034/105] [OpenMP][omptest] Fix missing source extension

The file extension was accidentally omitted from #164794.
---
 openmp/tools/omptest/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/tools/omptest/test/CMakeLists.txt b/openmp/tools/omptest/test/CMakeLists.txt
index 1e07a1044f7d6..2b4aa78b0bc16 100644
--- a/openmp/tools/omptest/test/CMakeLists.txt
+++ b/openmp/tools/omptest/test/CMakeLists.txt
@@ -9,7 +9,7 @@ set(UNITTEST_SOURCES
   unittests/asserter-seq-test.cpp
   unittests/internal-event-eq-test.cpp
   unittests/internal-event-tostring-test.cpp
-  unittests/internal-util-test
+  unittests/internal-util-test.cpp
   unittests/main-test.cpp
 )
 add_executable(omptest-unittests ${UNITTEST_SOURCES})

From a9633aac31cb1ec42153fb3ada815aa1572eb58f Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Mon, 17 Nov 2025 17:31:55 +0100
Subject: [PATCH 035/105] [libc++] Fix __hash_table::erase(iterator, iterator)
 to update the bucket list correctly when erasing the last bucket (#167865)

Fixes #167820
---
 libcxx/include/__hash_table                   |  2 ++
 .../unord.map.modifiers/erase_range.pass.cpp  | 22 +++++++++++++++++++
 .../erase_range.pass.cpp                      | 22 +++++++++++++++++++
 .../unord/unord.multiset/erase_range.pass.cpp | 22 +++++++++++++++++++
 .../unord/unord.set/erase_range.pass.cpp      | 22 +++++++++++++++++++
 5 files changed, 90 insertions(+)

diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index e1897949a47e6..ef487fb06dd5e 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -1910,6 +1910,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_it
         __bucket_list_[__next_chash] = __before_first;
         __chash = __next_chash;
       }
+ __bucket_list_[__chash] = nullptr; } } diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp index 532413437f6be..81371638143c9 100644 --- a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp @@ -57,6 +57,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_map m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_map m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multimap m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multiset, std::equal_to, min_allocator> C; diff --git a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp index 5fa6e4199f756..1f049a295b8c3 100644 --- a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp @@ -47,6 +47,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_set m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_set, std::equal_to, min_allocator> C; From 7659cd42578c59d1bef1313053d493171b9146a2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 17 Nov 2025 16:36:42 +0000 Subject: [PATCH 036/105] [VectorUtils] Use PatternMatch in findScalarElement (NFC) (#168389) --- llvm/lib/Analysis/VectorUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 530fa9518f40e..a3e9b039f9225 100644 --- 
a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -317,9 +317,9 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { if (InsertElementInst *III = dyn_cast(V)) { // If this is an insert to a variable element, we don't know what it is. - if (!isa(III->getOperand(2))) + uint64_t IIElt; + if (!match(III->getOperand(2), m_ConstantInt(IIElt))) return nullptr; - unsigned IIElt = cast(III->getOperand(2))->getZExtValue(); // If this is an insert to the element we are looking for, return the // inserted value. From 4dd27960706cf2681b72cc2cf7cd8ccbcf0f4f9d Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Mon, 17 Nov 2025 08:45:49 -0800 Subject: [PATCH 037/105] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_FMUL (#167847) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll | 165 ++++++++++++++++++ .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 2 + .../AMDGPU/GlobalISel/regbankselect-fmul.mir | 5 +- 4 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 90114e44f1a48..b81a08de383d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -935,7 +935,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, bool hasSALUFloat = ST->hasSALUFloatInsts(); - addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL}, Standard) .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll new file mode 100644 index 0000000000000..84ac58f899717 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fmul_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fmul_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps half @fmul_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fmul_s16_div: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fmul_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fmul_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps float @fmul_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fmul_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps float @fmul_s32_div(float %a, float %b) { +; GCN-LABEL: fmul_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps void @fmul_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fmul_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fmul_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fmul_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fmul_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fmul_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_mul_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_mul_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x half> @fmul_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fmul_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_uniform(<2 x float> 
inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fmul_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_mul_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s2 +; GFX12-NEXT: s_mul_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fmul_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index e03aa18d3147f..1220c0e3b1ead 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -4,6 +4,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s +; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support. + define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir index 5766c05426b2d..f289566a27c12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- name: fmul_ss @@ -17,6 +17,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FMUL]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_FMUL %0, %1 From d163988dd2833f28fbca8c144265108d25ae7bd2 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Mon, 17 Nov 2025 22:18:14 +0530 Subject: [PATCH 038/105] [MLIR][NVVM][NFC] Re-order mem_scope and shared_space attrs (#168348) The mbarrier Ops also require access to the `mem_scope` and `shared_space` attributes. Hence, this patch moves their definitions to the beginning of the file alongside the other attribute definitions. 
Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 53 +++++++++++---------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index d4ef5104d3c1f..456d816205b58 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -228,6 +228,33 @@ def NVVMMemorySpaceAttr : let assemblyFormat = "`<` $value `>`"; } +// Attrs describing the scope of the Memory Operation +def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">; +def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">; +def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">; +def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">; + +def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind", + [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def MemScopeKindAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +// Attrs to disambiguate the cta or cluster space within shared memory +def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">; +def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">; +def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space", + [SharedSpaceCTA, SharedSpaceCluster]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def SharedSpaceAttr : EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + //===----------------------------------------------------------------------===// // NVVM intrinsic operations //===----------------------------------------------------------------------===// @@ -1107,17 +1134,6 @@ def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> { let assemblyFormat = "attr-dict"; } -def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">; -def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">; -def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space", - [SharedSpaceCTA, SharedSpaceCluster]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def SharedSpaceAttr : EnumAttr { - let assemblyFormat = "`<` $value `>`"; -} - def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">; def ProxyAsync : I32EnumAttrCase<"async", 1, "async">; def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">; @@ -1158,21 +1174,6 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">, let hasVerifier = 1; } -// Attrs describing the scope of the Memory Operation -def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">; -def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">; -def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">; -def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">; - -def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind", - [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def MemScopeKindAttr : EnumAttr { - let assemblyFormat = "`<` $value `>`"; -} - def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">, Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size, DefaultValuedAttr Date: Mon, 17 Nov 2025 08:50:14 -0800 Subject: [PATCH 039/105] Remove shadowing "size" field from classes that inherit from SyntheticSection (#166323) A field-named 'size' already available and perfectly usable via inheritance from 
InputSection, and these variables shadow it for no good reason. The only interesting change here is in PaddingSection, because a parent's field cannot be initialized via a constructor initializer list, setting it needs to be done inside the constructor body. --- lld/ELF/SyntheticSections.cpp | 6 +++--- lld/ELF/SyntheticSections.h | 13 +------------ 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 9a70c0d19c41d..19b08152ae081 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2747,9 +2747,9 @@ RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, ".relro_padding", SHT_NOBITS, SHF_ALLOC | SHF_WRITE, 1) {} -PaddingSection::PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent) - : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1), - size(size) { +PaddingSection::PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent) + : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1) { + size = amount; this->parent = parent; } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 38e68110e4bc0..66c866d7e8cde 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -78,8 +78,6 @@ class EhFrameSection final : public SyntheticSection { // allocating one for each EhInputSection. llvm::DenseMap offsetToCie; - uint64_t size = 0; - template void addRecords(EhInputSection *s); template void iterateFDEWithLSDAAux(EhInputSection &sec, @@ -127,7 +125,6 @@ class GotSection final : public SyntheticSection { protected: size_t numEntries = 0; uint32_t tlsIndexOff = -1; - uint64_t size = 0; struct AuthEntryInfo { size_t offset; bool isSymbolFunc; @@ -182,7 +179,6 @@ class BssSection final : public SyntheticSection { static bool classof(const SectionBase *s) { return isa(s) && cast(s)->bss; } - uint64_t size; }; class MipsGotSection final : public SyntheticSection { @@ -312,8 +308,6 @@ class MipsGotSection final : public SyntheticSection { // Number of "Header" entries. static const unsigned headerEntriesNum = 2; - uint64_t size = 0; - // Symbol and addend. 
using GotEntry = std::pair; @@ -407,8 +401,6 @@ class StringTableSection final : public SyntheticSection { private: const bool dynamic; - uint64_t size = 0; - llvm::DenseMap stringMap; SmallVector strings; }; @@ -475,7 +467,6 @@ template class DynamicSection final : public SyntheticSection { private: std::vector> computeContents(); - uint64_t size = 0; }; class RelocationBaseSection : public SyntheticSection { @@ -780,10 +771,8 @@ class RelroPaddingSection final : public SyntheticSection { }; class PaddingSection final : public SyntheticSection { - uint64_t size; - public: - PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent); + PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent); size_t getSize() const override { return size; } void writeTo(uint8_t *buf) override; }; From 35ae5157c0ab98a90231ff655b1a47d3f8a20d2b Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 17 Nov 2025 17:53:51 +0100 Subject: [PATCH 040/105] [MLIR][NVVM][Docs] Explain memory spaces (#168059) --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 34 +++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 456d816205b58..6e3a92b5bde42 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -79,6 +79,40 @@ def NVVM_Dialect : Dialect { sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX inline as a last-resort escape hatch, with explicit operands and results. + + + **Memory Spaces:** The NVVM dialect introduces the following memory spaces, + each with distinct scopes and lifetimes: +``` + | Memory Space | Address Space | Scope | Lifetime | + |-------------------|---------------|----------------------|-------------------| + | `generic` | 0 | All threads | Context-dependent | + | `global` | 1 | All threads (device) | Application | + | `shared` | 3 | Thread block (CTA) | Kernel execution | + | `constant` | 4 | All threads (RO) | Application | + | `local` | 5 | Single thread | Kernel execution | + | `tensor` | 6 | Thread block (CTA) | Kernel execution | + | `shared_cluster` | 7 | Thread block cluster | Kernel execution | +``` + **Memory Space Details:** + - **generic**: Can point to any memory space; requires runtime resolution of + actual address space. Use when pointer origin is unknown at compile time. + Performance varies based on the underlying memory space. + - **global**: Accessible by all threads across all blocks; persists across + kernel launches. Highest latency but largest capacity (device memory). Best + for large data and inter-kernel communication. + - **shared**: Shared within a thread block (CTA); very fast on-chip memory for + cooperation between threads in the same block. Limited capacity. Ideal for + block-level collaboration, caching, and reducing global memory traffic. + - **constant**: Read-only memory cached per SM. Size typically limited to + 64KB. Best for read-only data and uniform values accessed by all threads. + - **local**: Private to each thread. Use for per-thread private data and + automatic variables that don't fit in registers. + - **tensor**: Special memory space for tensor core operations. Used by + `tcgen05` instructions on SM 100+ for tensor input/output operations. + - **shared_cluster**: Distributed shared memory across thread blocks within + a cluster (SM 90+). Enables collaboration beyond single-block scope with + fast access across cluster threads. 
}]; let name = "nvvm"; From c66f1fdfb74802204afc425317062017d2487194 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Nov 2025 09:00:15 -0800 Subject: [PATCH 041/105] [MC] Use MCRegister::id() to avoid implicit casts. NFC (#168233) --- llvm/include/llvm/MC/MCRegisterInfo.h | 6 +++--- llvm/lib/MC/MCInst.cpp | 2 +- llvm/lib/MC/MCRegisterInfo.cpp | 25 +++++++++++++------------ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index 6e36e580358e7..f4897b6a406fb 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -438,7 +438,7 @@ class LLVM_ABI MCRegisterInfo { /// number. Returns -1 if there is no equivalent value. The second /// parameter allows targets to use different numberings for EH info and /// debugging info. - virtual int64_t getDwarfRegNum(MCRegister RegNum, bool isEH) const; + virtual int64_t getDwarfRegNum(MCRegister Reg, bool isEH) const; /// Map a dwarf register back to a target register. Returns std::nullopt if /// there is no mapping. @@ -450,11 +450,11 @@ class LLVM_ABI MCRegisterInfo { /// Map a target register to an equivalent SEH register /// number. Returns LLVM register number if there is no equivalent value. - int getSEHRegNum(MCRegister RegNum) const; + int getSEHRegNum(MCRegister Reg) const; /// Map a target register to an equivalent CodeView register /// number. - int getCodeViewRegNum(MCRegister RegNum) const; + int getCodeViewRegNum(MCRegister Reg) const; regclass_iterator regclass_begin() const { return Classes; } regclass_iterator regclass_end() const { return Classes+NumClasses; } diff --git a/llvm/lib/MC/MCInst.cpp b/llvm/lib/MC/MCInst.cpp index 46a6a18e15963..61eeb5e5a5c71 100644 --- a/llvm/lib/MC/MCInst.cpp +++ b/llvm/lib/MC/MCInst.cpp @@ -29,7 +29,7 @@ void MCOperand::print(raw_ostream &OS, const MCContext *Ctx) const { if (Ctx && Ctx->getRegisterInfo()) OS << Ctx->getRegisterInfo()->getName(getReg()); else - OS << getReg(); + OS << getReg().id(); } else if (isImm()) OS << "Imm:" << getImm(); else if (isSFPImm()) diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index 7fd92bf974b95..77fb7332619cd 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -89,7 +89,7 @@ ArrayRef MCRegisterInfo::getCachedAliasesOf(MCRegister R) const { return Aliases; for (MCRegAliasIteratorImpl It(R, this); It.isValid(); ++It) - Aliases.push_back(*It); + Aliases.push_back((*It).id()); sort(Aliases); Aliases.erase(unique(Aliases), Aliases.end()); @@ -141,15 +141,15 @@ unsigned MCRegisterInfo::getSubRegIndex(MCRegister Reg, return 0; } -int64_t MCRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const { +int64_t MCRegisterInfo::getDwarfRegNum(MCRegister Reg, bool isEH) const { const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs; unsigned Size = isEH ? 
EHL2DwarfRegsSize : L2DwarfRegsSize; if (!M) return -1; - DwarfLLVMRegPair Key = { RegNum, 0 }; + DwarfLLVMRegPair Key = {Reg.id(), 0}; const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key); - if (I == M+Size || I->FromReg != RegNum) + if (I == M + Size || I->FromReg != Reg) return -1; // Consumers need to be able to detect -1 and -2, but at various points // the numbers move between unsigned and signed representations, as well as @@ -191,20 +191,21 @@ int64_t MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(uint64_t RegNum) const { return RegNum; } -int MCRegisterInfo::getSEHRegNum(MCRegister RegNum) const { - const DenseMap::const_iterator I = L2SEHRegs.find(RegNum); - if (I == L2SEHRegs.end()) return (int)RegNum; +int MCRegisterInfo::getSEHRegNum(MCRegister Reg) const { + const DenseMap::const_iterator I = L2SEHRegs.find(Reg); + if (I == L2SEHRegs.end()) + return (int)Reg.id(); return I->second; } -int MCRegisterInfo::getCodeViewRegNum(MCRegister RegNum) const { +int MCRegisterInfo::getCodeViewRegNum(MCRegister Reg) const { if (L2CVRegs.empty()) report_fatal_error("target does not implement codeview register mapping"); - const DenseMap::const_iterator I = L2CVRegs.find(RegNum); + const DenseMap::const_iterator I = L2CVRegs.find(Reg); if (I == L2CVRegs.end()) - report_fatal_error("unknown codeview register " + (RegNum < getNumRegs() - ? getName(RegNum) - : Twine(RegNum))); + report_fatal_error("unknown codeview register " + (Reg.id() < getNumRegs() + ? getName(Reg) + : Twine(Reg.id()))); return I->second; } From 39e7712ac520ccfc43383b3e9d6ea8cf2958b8e3 Mon Sep 17 00:00:00 2001 From: Dharuni R Acharya <125176188+DharuniRAcharya@users.noreply.github.com> Date: Mon, 17 Nov 2025 22:30:40 +0530 Subject: [PATCH 042/105] [LLVM-Tablegen] Pretty Printing Arguments in LLVM Intrinsics (#162629) This patch adds LLVM infrastructure to support pretty printing of the intrinsic arguments. The motivation is to improve the readability of LLVM intrinsics and facilitate easy modifications and debugging of LLVM IR. This feature adds a property `ArgInfo<ArgIndex<n>, [ArgName<"argName">, ImmArgPrinter<"functionName">]>` to the intrinsic arguments to print self-explanatory inline comments for the arguments. The addition of pretty print support can provide a simple, low-overhead feature that enhances the usability of LLVM intrinsics without disrupting existing workflows.
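As a sketch of the intended output (the intrinsic and argument names below are hypothetical, chosen only for illustration), a declaration carrying `ArgInfo<ArgIndex<n>, [ArgName<"mode">, ImmArgPrinter<"printMode">]>` makes the IR printer annotate the corresponding immediate operand with an inline comment:

```llvm
call void @llvm.example.op(ptr %p, /* mode=round_nearest */ i32 0)
```

The `mode=` prefix comes from `ArgName` and the value text from the named printer callback; an argument that specifies only an `ArgName` prints just the `name=` prefix.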
Link to the RFC, where this feature was discussed: https://discourse.llvm.org/t/rfc-pretty-printing-immediate-arguments-in-llvm-intrinsics/88536 --------- Signed-off-by: Dharuni R Acharya Co-authored-by: Rahul Joshi --- llvm/include/llvm/IR/Intrinsics.h | 9 ++ llvm/include/llvm/IR/Intrinsics.td | 19 ++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 16 ++- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 48 ++++++++ llvm/lib/IR/AsmWriter.cpp | 41 +++++-- llvm/lib/IR/Intrinsics.cpp | 11 ++ .../NVPTX/tcgen05-mma-tensor-formatted.ll | 50 +++++++++ llvm/test/TableGen/intrinsic-arginfo.td | 71 ++++++++++++ .../TableGen/Basic/CodeGenIntrinsics.cpp | 36 ++++++ llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 16 +++ .../utils/TableGen/Basic/IntrinsicEmitter.cpp | 105 +++++++++++++++--- 11 files changed, 395 insertions(+), 27 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll create mode 100644 llvm/test/TableGen/intrinsic-arginfo.td diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 9577d0141f168..c91fc254ebe11 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -30,6 +30,8 @@ class LLVMContext; class Module; class AttributeList; class AttributeSet; +class raw_ostream; +class Constant; /// This namespace contains an enum with a value for every intrinsic/builtin /// function known by LLVM. The enum values are returned by @@ -81,6 +83,9 @@ namespace Intrinsic { /// Returns true if the intrinsic can be overloaded. LLVM_ABI bool isOverloaded(ID id); + /// Returns true if the intrinsic has pretty printed immediate arguments. + LLVM_ABI bool hasPrettyPrintedArgs(ID id); + /// isTargetIntrinsic - Returns true if IID is an intrinsic specific to a /// certain target. If it is a generic intrinsic false is returned. LLVM_ABI bool isTargetIntrinsic(ID IID); @@ -284,6 +289,10 @@ namespace Intrinsic { /// N. LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor); + /// Print the argument info for the arguments with ArgInfo. + LLVM_ABI void printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, + const Constant *ImmArgVal); + } // namespace Intrinsic } // namespace llvm diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 07aa2faffa7c5..27f404a1be65c 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -142,6 +142,25 @@ class Range : IntrinsicProperty { int Upper = upper; } +// ArgProperty - Base class for argument properties that can be specified in ArgInfo. +class ArgProperty; + +// ArgName - Specifies the name of an argument for pretty-printing. +class ArgName : ArgProperty { + string Name = name; +} + +// ImmArgPrinter - Specifies a custom printer function for immediate arguments. +class ImmArgPrinter : ArgProperty { + string FuncName = funcname; +} + +// ArgInfo - The specified argument has properties defined by a list of ArgProperty objects. +class ArgInfo arg_properties> : IntrinsicProperty { + int ArgNo = idx.Value; + list Properties = arg_properties; +} + def IntrNoReturn : IntrinsicProperty; // Applied by default. 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 21badc2692037..1b485dc8ccd1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2955,7 +2955,14 @@ foreach sp = [0, 1] in { defvar nargs = !size(args); defvar scale_d_imm = ArgIndex; defvar scale_d_imm_range = [ImmArg, Range]; - defvar intrinsic_properties = !listconcat( + + // Check if this is the specific llvm.nvvm.tcgen05.mma.tensor intrinsic. + defvar is_target_intrinsic = !and(!eq(sp, 0), + !eq(space, "tensor"), + !eq(scale_d, 0), + !eq(ashift, 0)); + + defvar base_properties = !listconcat( mma.common_intr_props, !if(!eq(scale_d, 1), scale_d_imm_range, []), [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, // kind @@ -2965,6 +2972,13 @@ foreach sp = [0, 1] in { ] ); + defvar intrinsic_properties = !if(is_target_intrinsic, + !listconcat(base_properties, + [ArgInfo, [ArgName<"kind">, ImmArgPrinter<"printTcgen05MMAKind">]>, + ArgInfo, [ArgName<"cta_group">]>, + ArgInfo, [ArgName<"collector">, ImmArgPrinter<"printTcgen05CollectorUsageOp">]>]), + base_properties); + def mma.record_name: DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, mma.intr_name>; diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index d55100e5e709d..d383769043605 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -18,8 +18,11 @@ #include #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { namespace nvvm { @@ -659,6 +662,51 @@ inline APFloat::roundingMode GetFMARoundingMode(Intrinsic::ID IntrinsicID) { llvm_unreachable("Invalid FP instrinsic rounding mode for NVVM fma"); } +inline void printTcgen05MMAKind(raw_ostream &OS, const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05MMAKind::F16: + OS << "f16"; + return; + case Tcgen05MMAKind::TF32: + OS << "tf32"; + return; + case Tcgen05MMAKind::F8F6F4: + OS << "f8f6f4"; + return; + case Tcgen05MMAKind::I8: + OS << "i8"; + return; + } + } + llvm_unreachable( + "printTcgen05MMAKind called with invalid value for immediate argument"); +} + +inline void printTcgen05CollectorUsageOp(raw_ostream &OS, + const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05CollectorUsageOp::DISCARD: + OS << "discard"; + return; + case Tcgen05CollectorUsageOp::LASTUSE: + OS << "lastuse"; + return; + case Tcgen05CollectorUsageOp::FILL: + OS << "fill"; + return; + case Tcgen05CollectorUsageOp::USE: + OS << "use"; + return; + } + } + llvm_unreachable("printTcgen05CollectorUsageOp called with invalid value for " + "immediate argument"); +} + } // namespace nvvm } // namespace llvm #endif // LLVM_IR_NVVMINTRINSICUTILS_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 4d4ffe93a8067..94a1aa3087377 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -53,6 +53,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -4576,12 +4577,38 @@ void 
AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + bool HasPrettyPrintedArgs = + isa(CI) && + Intrinsic::hasPrettyPrintedArgs(CI->getIntrinsicID()); + ListSeparator LS; - for (unsigned op = 0, Eop = CI->arg_size(); op < Eop; ++op) { - Out << LS; - writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op)); + Function *CalledFunc = CI->getCalledFunction(); + auto PrintArgComment = [&](unsigned ArgNo) { + const auto *ConstArg = dyn_cast(CI->getArgOperand(ArgNo)); + if (!ConstArg) + return; + std::string ArgComment; + raw_string_ostream ArgCommentStream(ArgComment); + Intrinsic::ID IID = CalledFunc->getIntrinsicID(); + Intrinsic::printImmArg(IID, ArgNo, ArgCommentStream, ConstArg); + if (ArgComment.empty()) + return; + Out << "/* " << ArgComment << " */ "; + }; + if (HasPrettyPrintedArgs) { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < NumArgs; + ++ArgNo) { + Out << LS; + PrintArgComment(ArgNo); + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } + } else { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < NumArgs; + ++ArgNo) { + Out << LS; + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } } - // Emit an ellipsis if this is a musttail call in a vararg function. This // is only to aid readability, musttail calls forward varargs by default. if (CI->isMustTailCall() && CI->getParent() && @@ -5005,12 +5032,10 @@ void AssemblyWriter::printUseLists(const Function *F) { //===----------------------------------------------------------------------===// void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, - bool ShouldPreserveUseListOrder, - bool IsForDebug) const { + bool ShouldPreserveUseListOrder, bool IsForDebug) const { SlotTracker SlotTable(this->getParent()); formatted_raw_ostream OS(ROS); - AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, - IsForDebug, + AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, IsForDebug, ShouldPreserveUseListOrder); W.printFunction(this); } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 526800e217399..859689b9cf168 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/IntrinsicsXCore.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/IR/Type.h" using namespace llvm; @@ -601,6 +602,12 @@ bool Intrinsic::isOverloaded(ID id) { #undef GET_INTRINSIC_OVERLOAD_TABLE } +bool Intrinsic::hasPrettyPrintedArgs(ID id){ +#define GET_INTRINSIC_PRETTY_PRINT_TABLE +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_TABLE +} + /// Table of per-target intrinsic name tables. 
#define GET_INTRINSIC_TARGET_DATA #include "llvm/IR/IntrinsicImpl.inc" @@ -1142,3 +1149,7 @@ Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); return InterleaveIntrinsics[Factor - 2].Deinterleave; } + +#define GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll new file mode 100644 index 0000000000000..479de53dd90f2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; NOTE: This sample test demonstrates the pretty print feature for NVPTX intrinsics +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_fp16_cta1( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_f8f6f4_cta2( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 2, i32 2, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + + ret void +} + +; This test verifies that printImmArg is safe to call on all constant arguments, but only prints comments for arguments that have pretty printing configured. +define void @test_mixed_constants_edge_case(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor) { + ; CHECK-LABEL: define void @test_mixed_constants_edge_case( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, i32 3, i32 1, i32 0) + + ret void +} + +declare void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6), ptr addrspace(6), i64, i32, i1, i32, i32, i32) diff --git a/llvm/test/TableGen/intrinsic-arginfo.td b/llvm/test/TableGen/intrinsic-arginfo.td new file mode 100644 index 0000000000000..eab1f5e032bc3 --- /dev/null +++ b/llvm/test/TableGen/intrinsic-arginfo.td @@ -0,0 +1,71 @@ +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s | FileCheck %s + +// Test ArgInfo property for pretty-printing intrinsic arguments. +// This test verifies that TableGen generates the correct pretty-printing code +// for intrinsics that use the ArgInfo property. + +include "llvm/IR/Intrinsics.td" + +// Simple intrinsic with two arguments that have ArgInfo. +def int_dummy_foo_bar : DefaultAttrsIntrinsic< + [llvm_i32_ty], + [llvm_i32_ty, // data + llvm_i32_ty, // mode + llvm_i32_ty], // stride + [IntrNoMem, + ImmArg>, + ArgInfo, [ArgName<"mode">, ImmArgPrinter<"printDummyMode">]>, + ArgInfo, [ArgName<"stride">]>]>; + +// A custom floating point add with rounding and sat mode. 
+def int_my_fadd_f32 : DefaultAttrsIntrinsic< + [llvm_float_ty], + [llvm_float_ty, // a + llvm_float_ty, // b + llvm_i32_ty, // rounding_mode + llvm_i1_ty], // saturation_mode + [IntrNoMem, + ImmArg>, + ImmArg>, + ArgInfo, [ArgName<"rounding_mode">, ImmArgPrinter<"printRoundingMode">]>, + ArgInfo, [ArgName<"saturation_mode">]>]>; + +// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_TABLE +// CHECK-NEXT: static constexpr uint8_t PPTable[] = { + +// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_TABLE + +// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +// CHECK: void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) { + +// CHECK: case dummy_foo_bar: +// CHECK-NEXT: switch (ArgIdx) { + +// CHECK-NEXT: case 1: +// CHECK-NEXT: OS << "mode="; +// CHECK-NEXT: printDummyMode(OS, ImmArgVal); +// CHECK-NEXT: return; + +// CHECK-NEXT: case 2: +// CHECK-NEXT: OS << "stride="; +// CHECK-NEXT: return; + +// CHECK-NEXT: } +// CHECK-NEXT: break; + +// CHECK: case my_fadd_f32: +// CHECK-NEXT: switch (ArgIdx) { + +// CHECK-NEXT: case 2: +// CHECK-NEXT: OS << "rounding_mode="; +// CHECK-NEXT: printRoundingMode(OS, ImmArgVal); +// CHECK-NEXT: return; + +// CHECK-NEXT: case 3: +// CHECK-NEXT: OS << "saturation_mode="; +// CHECK-NEXT: return; + +// CHECK-NEXT: } +// CHECK-NEXT: break; + +// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index ff894853b9771..228969ab37f85 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -449,6 +449,29 @@ void CodeGenIntrinsic::setProperty(const Record *R) { int64_t Lower = R->getValueAsInt("Lower"); int64_t Upper = R->getValueAsInt("Upper"); addArgAttribute(ArgNo, Range, Lower, Upper); + } else if (R->isSubClassOf("ArgInfo")) { + unsigned ArgNo = R->getValueAsInt("ArgNo"); + if (ArgNo < 1) + PrintFatalError(R->getLoc(), + "ArgInfo requires ArgNo >= 1 (0 is return value)"); + const ListInit *Properties = R->getValueAsListInit("Properties"); + StringRef ArgName; + StringRef FuncName; + + for (const Init *PropInit : Properties->getElements()) { + if (const auto *PropDef = dyn_cast(PropInit)) { + const Record *PropRec = PropDef->getDef(); + + if (PropRec->isSubClassOf("ArgName")) + ArgName = PropRec->getValueAsString("Name"); + else if (PropRec->isSubClassOf("ImmArgPrinter")) + FuncName = PropRec->getValueAsString("FuncName"); + else + PrintFatalError(PropRec->getLoc(), + "Unknown ArgProperty type: " + PropRec->getName()); + } + } + addPrettyPrintFunction(ArgNo - 1, ArgName, FuncName); } else { llvm_unreachable("Unknown property!"); } @@ -476,3 +499,16 @@ void CodeGenIntrinsic::addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V, ArgumentAttributes.resize(Idx + 1); ArgumentAttributes[Idx].emplace_back(AK, V, V2); } + +void CodeGenIntrinsic::addPrettyPrintFunction(unsigned ArgIdx, + StringRef ArgName, + StringRef FuncName) { + auto It = llvm::find_if(PrettyPrintFunctions, [ArgIdx](const auto &Info) { + return Info.ArgIdx == ArgIdx; + }); + if (It != PrettyPrintFunctions.end()) + PrintFatalError(TheDef->getLoc(), "ArgInfo for argument " + Twine(ArgIdx) + + " is already defined as '" + + It->FuncName + "'"); + PrettyPrintFunctions.emplace_back(ArgIdx, ArgName, FuncName); +} diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 15e803c4feba1..6ac6f734326d8 100644 --- 
a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -152,6 +152,22 @@ struct CodeGenIntrinsic { void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0, uint64_t V2 = 0); + /// Structure to store pretty print and argument information. + struct PrettyPrintArgInfo { + unsigned ArgIdx; + StringRef ArgName; + StringRef FuncName; + + PrettyPrintArgInfo(unsigned Idx, StringRef Name, StringRef Func) + : ArgIdx(Idx), ArgName(Name), FuncName(Func) {} + }; + + /// Vector that stores ArgInfo (ArgIndex, ArgName, FunctionName). + SmallVector PrettyPrintFunctions; + + void addPrettyPrintFunction(unsigned ArgIdx, StringRef ArgName, + StringRef FuncName); + bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } /// Goes through all IntrProperties that have IsDefault value set and sets diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 452d2b08f25c3..3ac23185ef91c 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -60,8 +60,16 @@ class IntrinsicEmitter { raw_ostream &OS); void EmitIntrinsicToOverloadTable(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); + void EmitIntrinsicToPrettyPrintTable(const CodeGenIntrinsicTable &Ints, + raw_ostream &OS); + void EmitIntrinsicBitTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard, + StringRef TableName, StringRef Comment, + function_ref GetProperty); void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); + void EmitPrettyPrintArguments(const CodeGenIntrinsicTable &Ints, + raw_ostream &OS); void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS); }; @@ -109,6 +117,12 @@ void IntrinsicEmitter::run(raw_ostream &OS, bool Enums) { // Emit the intrinsic parameter attributes. EmitAttributes(Ints, OS); + // Emit the intrinsic ID -> pretty print table. + EmitIntrinsicToPrettyPrintTable(Ints, OS); + + // Emit Pretty Print attribute. + EmitPrettyPrintArguments(Ints, OS); + // Emit code to translate Clang builtins into LLVM intrinsics. EmitIntrinsicToBuiltinMap(Ints, true, OS); @@ -240,6 +254,29 @@ static constexpr IntrinsicTargetInfo TargetInfos[] = { )"; } +/// Helper function to emit a bit table for intrinsic properties. +/// This is used for both overload and pretty print bit tables. +void IntrinsicEmitter::EmitIntrinsicBitTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard, + StringRef TableName, StringRef Comment, + function_ref GetProperty) { + OS << formatv("// {}\n", Comment); + OS << formatv("#ifdef {}\n", Guard); + OS << formatv("static constexpr uint8_t {}[] = {{\n", TableName); + OS << " 0\n "; + for (auto [I, Int] : enumerate(Ints)) { + // Add one to the index so we emit a null bit for the invalid #0 intrinsic. + size_t Idx = I + 1; + if (Idx % 8 == 0) + OS << ",\n 0"; + if (GetProperty(Int)) + OS << " | (1<<" << Idx % 8 << ')'; + } + OS << "\n};\n\n"; + OS << formatv("return ({}[id/8] & (1 << (id%8))) != 0;\n", TableName); + OS << formatv("#endif // {}\n\n", Guard); +} + void IntrinsicEmitter::EmitIntrinsicToNameTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { // Built up a table of the intrinsic names. 
@@ -276,24 +313,10 @@ static constexpr unsigned IntrinsicNameOffsetTable[] = { void IntrinsicEmitter::EmitIntrinsicToOverloadTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << R"(// Intrinsic ID to overload bitset. -#ifdef GET_INTRINSIC_OVERLOAD_TABLE -static constexpr uint8_t OTable[] = { - 0 - )"; - for (auto [I, Int] : enumerate(Ints)) { - // Add one to the index so we emit a null bit for the invalid #0 intrinsic. - size_t Idx = I + 1; - - if (Idx % 8 == 0) - OS << ",\n 0"; - if (Int.isOverloaded) - OS << " | (1<<" << Idx % 8 << ')'; - } - OS << "\n};\n\n"; - // OTable contains a true bit at the position if the intrinsic is overloaded. - OS << "return (OTable[id/8] & (1 << (id%8))) != 0;\n"; - OS << "#endif\n\n"; + EmitIntrinsicBitTable( + Ints, OS, "GET_INTRINSIC_OVERLOAD_TABLE", "OTable", + "Intrinsic ID to overload bitset.", + [](const CodeGenIntrinsic &Int) { return Int.isOverloaded; }); } using TypeSigTy = SmallVector; @@ -809,6 +832,52 @@ AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) {{ NoFunctionAttrsID); } +void IntrinsicEmitter::EmitIntrinsicToPrettyPrintTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { + EmitIntrinsicBitTable(Ints, OS, "GET_INTRINSIC_PRETTY_PRINT_TABLE", "PPTable", + "Intrinsic ID to pretty print bitset.", + [](const CodeGenIntrinsic &Int) { + return !Int.PrettyPrintFunctions.empty(); + }); +} + +void IntrinsicEmitter::EmitPrettyPrintArguments( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { + OS << R"( +#ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) { + using namespace Intrinsic; + switch (IID) { +)"; + + for (const CodeGenIntrinsic &Int : Ints) { + if (Int.PrettyPrintFunctions.empty()) + continue; + + OS << " case " << Int.EnumName << ":\n"; + OS << " switch (ArgIdx) {\n"; + for (const auto [ArgIdx, ArgName, FuncName] : Int.PrettyPrintFunctions) { + OS << " case " << ArgIdx << ":\n"; + OS << " OS << \"" << ArgName << "=\";\n"; + if (!FuncName.empty()) { + OS << " "; + if (!Int.TargetPrefix.empty()) + OS << Int.TargetPrefix << "::"; + OS << FuncName << "(OS, ImmArgVal);\n"; + } + OS << " return;\n"; + } + OS << " }\n"; + OS << " break;\n"; + } + OS << R"( default: + break; + } +} +#endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +)"; +} + void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS) { StringRef CompilerName = IsClang ? "Clang" : "MS"; From 6f5c8fe1c1d24604d3328b82f5a1ed348e59326f Mon Sep 17 00:00:00 2001 From: Govind Malasani <145235389+gmalasan@users.noreply.github.com> Date: Mon, 17 Nov 2025 12:01:44 -0500 Subject: [PATCH 043/105] [MLIR][SparseTensor] Dense Outer Loop Ordering Strategy (#160168) This PR builds upon the infrastructure set up for Sparse Tensor Loop Ordering Heuristics (#154656) by adding a preference to have dense loops outer and sparse loops inner. As always I'd love to get feedback and know if there's any other direction to go with this work that might be better. 
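A minimal usage sketch (the pass and option spellings below follow the Passes.td change in this patch): under this strategy, a loop that touches any dense tensor or dense level receives the lowest sparsity rank and is chosen first for the outer positions, while loops whose accesses land only on compressed or singleton levels sink inward. The strategy is selected on the reinterpret-map pass, e.g.:

```
mlir-opt input.mlir \
  --sparse-reinterpret-map="loop-ordering-strategy=dense-outer"
```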
--- .../Dialect/SparseTensor/Transforms/Passes.h | 5 +- .../Dialect/SparseTensor/Transforms/Passes.td | 4 +- .../Transforms/Utils/IterationGraphSorter.cpp | 72 ++++++++++++++++++- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index af64370a62dd7..419ecda80e9a5 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -58,9 +58,10 @@ enum class SparseEmitStrategy { namespace sparse_tensor { /// Defines a strategy for loop ordering during sparse code generation. +/// See Passes.td for strategy descriptions. enum class LoopOrderingStrategy : unsigned { - kDefault, ///< Default strategy (eagerly selects last loop in topological - ///< sort). + kDefault, + kDenseOuter, }; } // namespace sparse_tensor diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 75e77d67db1b3..0b8562e484f51 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -85,7 +85,9 @@ def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> { "mlir::sparse_tensor::LoopOrderingStrategy::kDefault", "Set the loop ordering strategy for sparse code generation", [{llvm::cl::values( clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDefault, "default", - "Default strategy (eagerly selects last loop in topological sort)"))}]>, + "Default strategy (eagerly selects last loop in topological sort)"), + clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDenseOuter, "dense-outer", + "Prefer dense, then compressed, then singleton dimensions outermost"))}]>, ]; } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp index ffa8b402e0b6b..99048034b4f0c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp @@ -80,6 +80,53 @@ inline static bool includesDenseOutput(SortMask mask) { return includesAny(mask, SortMask::kIncludeDenseOutput); } +/// Returns a sparsity rank for loop ordering: lower values indicate +/// dimensions that should be placed in outer loops. +/// 0 = Dense, 1 = Compressed, 2 = Singleton, 3 = Other/Unknown. +static unsigned getLoopSparsityRank(unsigned loop, ArrayRef allTensors, + ArrayRef allMaps) { + // Start with highest rank. + unsigned minRank = 3; + + for (auto [tensor, map] : llvm::zip(allTensors, allMaps)) { + // Check if this loop accesses this tensor. + bool loopAccessesTensor = false; + unsigned tensorDim = 0; + for (AffineExpr expr : map.getResults()) { + if (auto dimExpr = dyn_cast(expr)) { + if (dimExpr.getPosition() == loop) { + loopAccessesTensor = true; + break; + } + } + tensorDim++; + } + + if (loopAccessesTensor) { + const auto enc = getSparseTensorEncoding(tensor.getType()); + if (!enc) { + // Dense tensor - lowest rank. + return 0; + } else { + // Sparse tensor - check the level type for this dimension. + auto lvlTypes = enc.getLvlTypes(); + if (tensorDim < lvlTypes.size()) { + auto lvlType = lvlTypes[tensorDim]; + if (isDenseLT(lvlType)) { + return 0; // Dense level. + } else if (isCompressedLT(lvlType)) { + minRank = std::min(minRank, 1u); // Compressed level. 
+ } else if (isSingletonLT(lvlType)) { + minRank = std::min(minRank, 2u); // Singleton level. + } + } + } + } + } + + return minRank; +} + AffineMap IterationGraphSorter::topoSort() { // The sorted result will put the first Reduction iterator to the // latest possible position. @@ -107,10 +154,33 @@ AffineMap IterationGraphSorter::topoSort() { case sparse_tensor::LoopOrderingStrategy::kDefault: src = it.back(); break; + case sparse_tensor::LoopOrderingStrategy::kDenseOuter: { + // Prefer dense, then compressed, then singleton dimensions outermost. + // Create combined tensor and map lists for analysis. + SmallVector allTensors = ins; + allTensors.push_back(out); + SmallVector allMaps = loop2InsLvl; + allMaps.push_back(loop2OutLvl); + + // Find loop with minimum (lowest) sparsity rank. + unsigned minLoop = it[0]; + unsigned minRank = getLoopSparsityRank(minLoop, allTensors, allMaps); + + for (auto candidateLoop : it) { + unsigned rank = getLoopSparsityRank(candidateLoop, allTensors, allMaps); + if (rank < minRank || (rank == minRank && candidateLoop < minLoop)) { + minLoop = candidateLoop; + minRank = rank; + } + } + src = minLoop; + break; + } } loopOrder.push_back(src); - it.pop_back(); + // Remove the selected loop from the worklist. + it.erase(std::find(it.begin(), it.end(), src)); // Update in-degree, and push 0-degree node into worklist. for (unsigned dst = 0; dst < numLoops; dst++) { if (itGraph[src][dst] && --inDegree[dst] == 0) { From 0c8464330a510e0c3b629883ed1acd81da17da5d Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 17 Nov 2025 09:09:46 -0800 Subject: [PATCH 044/105] [CIR] Upstream handling for BaseToDerived casts (#167769) Upstream handling for BaseToDerived casts, adding the cir.base_class_addr operation and lowering to LLVM IR. --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 56 ++++++++++- clang/lib/CIR/CodeGen/CIRGenBuilder.h | 13 +++ clang/lib/CIR/CodeGen/CIRGenClass.cpp | 19 ++++ clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 18 +++- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 15 ++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 5 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 35 +++++++ clang/test/CIR/CodeGen/base-to-derived.cpp | 97 +++++++++++++++++++ 8 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGen/base-to-derived.cpp diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2124b1dc62a81..7b987ea49bf97 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -3386,6 +3386,10 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> { cannot be known by the operation, and that information affects how the operation is lowered. + The validity of the relationship of derived and base cannot yet be verified. + If the target class is not a valid base class for the object, the behavior + is undefined. + Example: ```c++ struct Base { }; @@ -3399,8 +3403,6 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> { ``` }]; - // The validity of the relationship of derived and base cannot yet be - // verified, currently not worth adding a verifier. 
let arguments = (ins Arg:$derived_addr, IndexAttr:$offset, UnitAttr:$assume_not_null); let results = (outs Res:$base_addr); let assemblyFormat = [{ $derived_addr `:` qualified(type($derived_addr)) (`nonnull` $assume_not_null^)? ` ` `[` $offset `]` `->` qualified(type($base_addr)) attr-dict }]; } +//===----------------------------------------------------------------------===// +// DerivedClassAddrOp +//===----------------------------------------------------------------------===// + +def CIR_DerivedClassAddrOp : CIR_Op<"derived_class_addr"> { + let summary = "Get the derived class address for a class/struct"; + let description = [{ + The `cir.derived_class_addr` operation gets the address of a particular + derived class given a non-virtual base class pointer. The offset in bytes + of the base class must be passed in, similar to `cir.base_class_addr`, but + going in the other direction. This means lowering to a negative offset. + + The operation contains a flag for whether or not the operand may be nullptr. + That depends on the context and cannot be known by the operation, and that + information affects how the operation is lowered. + + The validity of the relationship of derived and base cannot yet be verified. + If the target class is not a valid derived class for the object, the + behavior is undefined. + + Example: + ```c++ + class A {}; + class B : public A {}; + + B *getAsB(A *a) { + return static_cast(a); + } + ``` + + leads to + ```mlir + %2 = cir.load %0 : !cir.ptr>, !cir.ptr + %3 = cir.derived_class_addr %2 : !cir.ptr [0] -> !cir.ptr + ``` + }]; + + let arguments = (ins + Arg:$base_addr, + IndexAttr:$offset, UnitAttr:$assume_not_null); + + let results = (outs Res:$derived_addr); + + let assemblyFormat = [{ + $base_addr `:` qualified(type($base_addr)) + (`nonnull` $assume_not_null^)? + ` ` `[` $offset `]` `->` qualified(type($derived_addr)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // ComplexCreateOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index a391d7e70ace7..5ab1d0e05cf8a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -405,6 +405,19 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return Address(baseAddr, destType, addr.getAlignment()); }
mlir::Type derivedValueTy = convertType(derivedTy); + CharUnits nonVirtualOffset = + cgm.computeNonVirtualBaseClassOffset(derived, path); + + // Note that in OG, no offset (nonVirtualOffset.getQuantity() == 0) means it + // just gives the address back. In CIR a `cir.derived_class` is created and + // made into a nop later on during lowering. + return builder.createDerivedClassAddr(loc, baseAddr, derivedValueTy, + nonVirtualOffset.getQuantity(), + /*assumeNotNull=*/!nullCheckValue); +} + Address CIRGenFunction::getAddressOfBaseClass( Address value, const CXXRecordDecl *derived, llvm::iterator_range path, diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index d35bb0af0de14..681a801cd7d81 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1301,7 +1301,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { case CK_NonAtomicToAtomic: case CK_AtomicToNonAtomic: case CK_ToUnion: - case CK_BaseToDerived: case CK_ObjCObjectLValueCast: case CK_VectorSplat: case CK_ConstructorConversion: @@ -1336,6 +1335,7 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { lv.getAddress().getAlignment()), e->getType(), lv.getBaseInfo()); } + case CK_LValueBitCast: { // This must be a reinterpret_cast (or c-style equivalent). const auto *ce = cast(e); @@ -1387,6 +1387,22 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { return makeAddrLValue(baseAddr, e->getType(), lv.getBaseInfo()); } + case CK_BaseToDerived: { + const auto *derivedClassDecl = e->getType()->castAsCXXRecordDecl(); + LValue lv = emitLValue(e->getSubExpr()); + + // Perform the base-to-derived conversion + Address derived = getAddressOfDerivedClass( + getLoc(e->getSourceRange()), lv.getAddress(), derivedClassDecl, + e->path(), /*NullCheckValue=*/false); + // C++11 [expr.static.cast]p2: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. + assert(!cir::MissingFeatures::sanitizers()); + + assert(!cir::MissingFeatures::opTBAA()); + return makeAddrLValue(derived, e->getType(), lv.getBaseInfo()); + } + case CK_ZeroToOCLOpaqueType: llvm_unreachable("NULL to OpenCL opaque type lvalue cast is not valid"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index ce95607bd468d..3b0977d213325 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1972,6 +1972,20 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return builder.createIntToPtr(middleVal, destCIRTy); } + case CK_BaseToDerived: { + const CXXRecordDecl *derivedClassDecl = destTy->getPointeeCXXRecordDecl(); + assert(derivedClassDecl && "BaseToDerived arg isn't a C++ object pointer!"); + Address base = cgf.emitPointerWithAlignment(subExpr); + Address derived = cgf.getAddressOfDerivedClass( + cgf.getLoc(ce->getSourceRange()), base, derivedClassDecl, ce->path(), + cgf.shouldNullCheckClassCastValue(ce)); + + // C++11 [expr.static.cast]p11: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. 
+ assert(!cir::MissingFeatures::sanitizers()); + + return cgf.getAsNaturalPointerTo(derived, ce->getType()->getPointeeType()); + } case CK_UncheckedDerivedToBase: case CK_DerivedToBase: { // The EmitPointerWithAlignment path does this fine; just discard @@ -1979,7 +1993,6 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce), ce->getType()->getPointeeType()); } - case CK_Dynamic: { Address v = cgf.emitPointerWithAlignment(subExpr); const auto *dce = cast(ce); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 2dddf26981105..b22bf2d87fc10 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -823,6 +823,11 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::iterator_range path, bool nullCheckValue, SourceLocation loc); + Address getAddressOfDerivedClass( + mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived, + llvm::iterator_range path, + bool nullCheckValue); + /// Return the VTT parameter that should be passed to a base /// constructor/destructor with virtual bases. /// FIXME: VTTs are Itanium ABI-specific, so the definition should move diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d88a4ad76f27b..92434d730eb31 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1360,6 +1360,41 @@ mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMDerivedClassAddrOpLowering::matchAndRewrite( + cir::DerivedClassAddrOp derivedClassOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + const mlir::Type resultType = + getTypeConverter()->convertType(derivedClassOp.getType()); + mlir::Value baseAddr = adaptor.getBaseAddr(); + // The offset is set in the operation as an unsigned value, but it must be + // applied as a negative offset. 
+ int64_t offsetVal = -(adaptor.getOffset().getZExtValue()); + if (offsetVal == 0) { + // If the offset is zero, we can just return the base address, + rewriter.replaceOp(derivedClassOp, baseAddr); + return mlir::success(); + } + llvm::SmallVector offset = {offsetVal}; + mlir::Type byteType = mlir::IntegerType::get(resultType.getContext(), 8, + mlir::IntegerType::Signless); + if (derivedClassOp.getAssumeNotNull()) { + rewriter.replaceOpWithNewOp( + derivedClassOp, resultType, byteType, baseAddr, offset, + mlir::LLVM::GEPNoWrapFlags::inbounds); + } else { + mlir::Location loc = derivedClassOp.getLoc(); + mlir::Value isNull = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, baseAddr, + mlir::LLVM::ZeroOp::create(rewriter, loc, baseAddr.getType())); + mlir::Value adjusted = + mlir::LLVM::GEPOp::create(rewriter, loc, resultType, byteType, baseAddr, + offset, mlir::LLVM::GEPNoWrapFlags::inbounds); + rewriter.replaceOpWithNewOp(derivedClassOp, isNull, + baseAddr, adjusted); + } + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite( cir::ATanOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { diff --git a/clang/test/CIR/CodeGen/base-to-derived.cpp b/clang/test/CIR/CodeGen/base-to-derived.cpp new file mode 100644 index 0000000000000..af9aa0ffd19c1 --- /dev/null +++ b/clang/test/CIR/CodeGen/base-to-derived.cpp @@ -0,0 +1,97 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +class A { + int a; +}; + +class B { + int b; +public: + A *getAsA(); +}; + +class X : public A, public B { + int x; +}; + +X *castAtoX(A *a) { + return static_cast(a); +} + +// CIR: cir.func {{.*}} @_Z8castAtoXP1A(%[[ARG0:.*]]: !cir.ptr {{.*}}) +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] +// CIR: cir.store %[[ARG0]], %[[A_ADDR]] : !cir.ptr, !cir.ptr> +// CIR: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.ptr +// CIR: %[[X:.*]] = cir.derived_class_addr %[[A]] : !cir.ptr [0] -> !cir.ptr + +// Note: Because the offset is 0, a null check is not needed. 
+ + +// LLVM: define {{.*}} ptr @_Z8castAtoXP1A(ptr %[[ARG0:.*]]) +// LLVM: %[[A_ADDR:.*]] = alloca ptr +// LLVM: store ptr %[[ARG0]], ptr %[[A_ADDR]] +// LLVM: %[[X:.*]] = load ptr, ptr %[[A_ADDR]] + +// OGCG: define {{.*}} ptr @_Z8castAtoXP1A(ptr {{.*}} %[[ARG0:.*]]) +// OGCG: %[[A_ADDR:.*]] = alloca ptr +// OGCG: store ptr %[[ARG0]], ptr %[[A_ADDR]] +// OGCG: %[[X:.*]] = load ptr, ptr %[[A_ADDR]] + +X *castBtoX(B *b) { + return static_cast(b); +} + +// CIR: cir.func {{.*}} @_Z8castBtoXP1B(%[[ARG0:.*]]: !cir.ptr {{.*}}) +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] +// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr, !cir.ptr> +// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.ptr +// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr [4] -> !cir.ptr + +// LLVM: define {{.*}} ptr @_Z8castBtoXP1B(ptr %[[ARG0:.*]]) +// LLVM: %[[B_ADDR:.*]] = alloca ptr, i64 1, align 8 +// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]], align 8 +// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]], align 8 +// LLVM: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null +// LLVM: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4 +// LLVM: %[[X:.*]] = select i1 %[[IS_NULL]], ptr %[[B]], ptr %[[B_NON_NULL]] + +// OGCG: define {{.*}} ptr @_Z8castBtoXP1B(ptr {{.*}} %[[ARG0:.*]]) +// OGCG: entry: +// OGCG: %[[B_ADDR:.*]] = alloca ptr +// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]] +// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]] +// OGCG: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null +// OGCG: br i1 %[[IS_NULL]], label %[[LABEL_NULL:.*]], label %[[LABEL_NOTNULL:.*]] +// OGCG: [[LABEL_NOTNULL]]: +// OGCG: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4 +// OGCG: br label %[[LABEL_END:.*]] +// OGCG: [[LABEL_NULL]]: +// OGCG: br label %[[LABEL_END:.*]] +// OGCG: [[LABEL_END]]: +// OGCG: %[[X:.*]] = phi ptr [ %[[B_NON_NULL]], %[[LABEL_NOTNULL]] ], [ null, %[[LABEL_NULL]] ] + +X &castBReftoXRef(B &b) { + return static_cast(b); +} + +// CIR: cir.func {{.*}} @_Z14castBReftoXRefR1B(%[[ARG0:.*]]: !cir.ptr {{.*}}) +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init, const] +// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr, !cir.ptr> +// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.ptr +// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr nonnull [4] -> !cir.ptr + +// LLVM: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr %[[ARG0:.*]]) +// LLVM: %[[B_ADDR:.*]] = alloca ptr +// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]] +// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]] +// LLVM: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4 + +// OGCG: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr {{.*}} %[[ARG0:.*]]) +// OGCG: %[[B_ADDR:.*]] = alloca ptr +// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]] +// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]] +// OGCG: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4 From 72b02c7b376f211a6fffd5524e5db4c006ec6704 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Nov 2025 09:13:55 -0800 Subject: [PATCH 045/105] [AMDGPU] Fix layering violations in AMDGPUMCExpr.cpp. NFC (#168242) AMDGPUMCExpr lives in the MC layer; it should not depend on Function.h or GCNSubtarget.h. Move the function that needed GCNSubtarget to the one file that called it.
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 30 +++++++++++++++++-- .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 26 ---------------- .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 5 ---- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9bc8b54c..8bfdbb7c5c310 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { return AsmPrinter::doInitialization(M); } +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only on the necessary values +/// for said occupancy computation. Should match computeOccupancy implementation +/// without passing \p STM on. +const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + unsigned DynamicVGPRBlockSize, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), + CreateExpr(Generation), CreateExpr(InitOcc), + NumSGPRs, NumVGPRs}, + Ctx); +} + void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) return; @@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { MaxWaves, MFI.getDynamicVGPRBlockSize())}); uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); - const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + const MCExpr *OccupancyExpr = createOccupancy( STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), @@ -1270,7 +1296,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); - ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( + ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->getDynamicVGPRBlockSize(), STM, Ctx);
-/// -/// Remove dependency on GCNSubtarget and depend only only the necessary values -/// for said occupancy computation. Should match computeOccupancy implementation -/// without passing \p STM on. -const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( - unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, - unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) { - unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); - unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); - unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); - unsigned Generation = STM.getGeneration(); - - auto CreateExpr = [&Ctx](unsigned Value) { - return MCConstantExpr::create(Value, Ctx); - }; - - return create(AGVK_Occupancy, - {CreateExpr(MaxWaves), CreateExpr(Granule), - CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), - CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, - Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 246a3f88ebce4..bf7b40b1851da 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -98,11 +98,6 @@ class AMDGPUMCExpr : public MCTargetExpr { return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUMCExpr * - createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, - const GCNSubtarget &STM, MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); From fb2b1387fb73a390d5d3e033277ed328c20553c3 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 17 Nov 2025 09:14:19 -0800 Subject: [PATCH 046/105] [CIR] Handle __asm labels on function declarations (#168149) This updates the CIR direct callee builder code to handle the case of calls to functions that were declared with an assembly label using `__asm`. The implementation doesn't actually have any explicit handling of the AsmLabelAttr. It is handled by the name mangler. See https://reviews.llvm.org/D137073 and https://reviews.llvm.org/D134362 for details on how this was implemented in classic codegen. The test here is copied from https://reviews.llvm.org/D134362 because the test in https://reviews.llvm.org/D134362 requires a target that isn't yet supported in CIR. 
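A minimal sketch of the pattern being handled (standalone C, distilled from the test below rather than copied verbatim):

```c
/* The builtin vprintf is redirected to a different symbol via an asm label. */
extern int vprintf(const char *__restrict fmt, __builtin_va_list ap)
    __asm("__vprintfieee128");

/* With a gnu_inline always_inline definition of vprintf in scope, CIRGen
 * emits an inline clone. Deriving the clone's name from the mangled name
 * yields "__vprintfieee128.inline" at both the definition and the call
 * site, instead of "vprintf.inline" at one of them. */
```

Using `cgm.getMangledName(gd)` rather than `fd->getName()` is what keeps the two sides consistent.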
--- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 6 +- .../CIR/CodeGen/asm-label-inline-builtins.c | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 clang/test/CIR/CodeGen/asm-label-inline-builtins.c diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 681a801cd7d81..8607558c1cf7d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1798,11 +1798,7 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { const auto *fd = cast(gd.getDecl()); if (unsigned builtinID = fd->getBuiltinID()) { - if (fd->getAttr()) { - cgm.errorNYI("AsmLabelAttr"); - } - - StringRef ident = fd->getName(); + StringRef ident = cgm.getMangledName(gd); std::string fdInlineName = (ident + ".inline").str(); bool isPredefinedLibFunction = diff --git a/clang/test/CIR/CodeGen/asm-label-inline-builtins.c b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c new file mode 100644 index 0000000000000..24c9a32e7c41d --- /dev/null +++ b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-cir -disable-llvm-passes -o %t-cir.cir %s +// RUN: FileCheck --input-file=%t-cir.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-llvm -disable-llvm-passes -o %t-cir.ll %s +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o %t.ll %s +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + + +// Verifies that clang-generated *.inline carry the same name at call and callee +// site, in spite of asm labels. + +typedef struct _IO_FILE FILE; +extern FILE *stdout; +extern int vprintf (const char *__restrict __format, __builtin_va_list __arg); +extern int __vfprintf_chk (FILE *__restrict __stream, int __flag, + const char *__restrict __format, __builtin_va_list __ap); +extern int __vprintf_chk (int __flag, const char *__restrict __format, + __builtin_va_list __ap); + +extern __typeof (vprintf) vprintf __asm ("__vprintfieee128"); +extern __typeof (__vfprintf_chk) __vfprintf_chk __asm ("__vfprintf_chkieee128"); +extern __typeof (__vprintf_chk) __vprintf_chk __asm ("__vprintf_chkieee128"); + +extern __inline __attribute__ ((__always_inline__)) __attribute__ ((__gnu_inline__)) __attribute__ ((__artificial__)) int +vprintf (const char *__restrict __fmt, __builtin_va_list __ap) +{ + return __vfprintf_chk (stdout, 2 - 1, __fmt, __ap); +} + +void test(const char *fmt, __builtin_va_list ap) { + vprintf(fmt, ap); +} + +// CIR: cir.func internal private @__vprintfieee128.inline({{.*}}) -> !s32i inline(always) +// CIR: cir.call @__vfprintf_chkieee128(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) +// +// CIR: cir.func {{.*}} @test({{.*}}) +// CIR: cir.call @__vprintfieee128.inline(%{{.*}}, %{{.*}}) + + +// LLVM: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] { +// LLVM: call i32 @__vfprintf_chkieee128(ptr %{{.*}}, i32 1, ptr %{{.*}}, ptr %{{.*}}) +// +// LLVM: define {{.*}} void @test{{.*}} +// LLVM: call i32 @__vprintfieee128.inline(ptr %{{.*}}, ptr %{{.*}}) +// +// LLVM: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline } + +// Note: OGCG emits these in the opposite order, but the content is the same. 
+ + +// OGCG: define {{.*}} void @test{{.*}} +// OGCG: call i32 @__vprintfieee128.inline(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// +// OGCG: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] { +// OGCG: call i32 @__vfprintf_chkieee128(ptr noundef %{{.*}}, i32 noundef 1, ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// +// OGCG: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline {{.*}} } From 8c674f04aa57766bbc7fac97c1e42526b22a95a4 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Mon, 17 Nov 2025 17:18:12 +0000 Subject: [PATCH 047/105] [OpenMP][Flang] Change the OmpDefaultMapperName suffix (#168399) This PR fixes a Fortran syntax violation in the OpenMP default mapper naming convention. The suffix .omp.default.mapper contains dots which are invalid in Fortran identifiers, causing failures when mappers are written to and read from module files. The fix changes the suffix to _omp_default_mapper which uses underscores instead of dots, complying with Fortran syntax rules. Key changes: - Changed OmpDefaultMapperName constant from .omp.default.mapper to _omp_default_mapper - Added GetUltimate() calls in mapper symbol resolution to properly handle symbols across module boundaries - Added new test case verifying default mappers work correctly when defined in a module and used in consuming programs This fixes #168336. --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 3 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 +- flang/test/Lower/OpenMP/declare-mapper.f90 | 43 +++++++++++++++++-- flang/test/Lower/OpenMP/derived-type-map.f90 | 2 +- flang/test/Lower/OpenMP/map-mapper.f90 | 4 +- flang/test/Lower/OpenMP/target.f90 | 2 +- .../Parser/OpenMP/declare-mapper-unparse.f90 | 2 +- .../OpenMP/openmp6-directive-spellings.f90 | 2 +- .../OpenMP/declare-mapper-symbols.f90 | 2 +- .../llvm/Frontend/OpenMP/OMPConstants.h | 2 +- 10 files changed, 51 insertions(+), 15 deletions(-) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index e018a2d937435..4a392381287d5 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1278,7 +1278,8 @@ void ClauseProcessor::processMapObjects( std::string mapperIdName = typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *sym = converter.getCurrentScope().FindSymbol(mapperIdName)) { - mapperIdName = converter.mangleName(mapperIdName, sym->owner()); + mapperIdName = + converter.mangleName(mapperIdName, sym->GetUltimate().owner()); } else { mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index f822fe3c8dd71..c6487349c4056 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2612,8 +2612,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *mapperSym = converter.getCurrentScope().FindSymbol(mapperIdName)) - mapperIdName = - converter.mangleName(mapperIdName, mapperSym->owner()); + mapperIdName = converter.mangleName( + mapperIdName, mapperSym->GetUltimate().owner()); else mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index 0266365cf03c0..a24d6bd0bf946 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -9,6 +9,8 
@@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-7.mod.f90 -o - >/dev/null ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-7.use.f90 -o - | FileCheck %t/omp-declare-mapper-7.use.f90 +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-8.mod.f90 -o - >/dev/null +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-8.use.f90 -o - | FileCheck %t/omp-declare-mapper-8.use.f90 !--- omp-declare-mapper-1.f90 subroutine declare_mapper_1 @@ -26,7 +28,7 @@ subroutine declare_mapper_1 end type type(my_type2) :: t real :: x, y(nvals) - !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type\.omp\.default\.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { + !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_1Evar"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"values"} {fortran_attrs = #fir.var_attrs} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>> @@ -153,7 +155,7 @@ subroutine declare_mapper_4 integer :: num end type - !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type.omp.default.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] + !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] !$omp declare mapper (my_type :: var) map (var%num) type(my_type) :: a @@ -185,9 +187,9 @@ program declare_mapper_5 end type !CHECK: omp.declare_mapper @[[INNER_MAPPER_NAMED:_QQFFuse_innermy_mapper]] : [[MY_TYPE:!fir\.type<_QFTmytype\{x:i32,y:i32\}>]] - !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype_omp_default_mapper]] : [[MY_TYPE]] !CHECK: omp.declare_mapper @[[OUTER_MAPPER_NAMED:_QQFmy_mapper]] : [[MY_TYPE]] - !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype_omp_default_mapper]] : [[MY_TYPE]] !$omp declare mapper(mytype :: var) map(tofrom: var%x) !$omp declare mapper(my_mapper : mytype :: var) map(tofrom: var%y) @@ -325,3 +327,36 @@ program use_module_mapper a%x = 42 !$omp end target end program use_module_mapper + +!--- omp-declare-mapper-8.mod.f90 +! Module with a default DECLARE MAPPER to be compiled separately. +module default_mapper_mod + implicit none + type :: dtype + integer :: x + end type dtype + !$omp declare mapper(dtype :: v) map(tofrom: v%x) +end module default_mapper_mod + +!--- omp-declare-mapper-8.use.f90 +! Consumer program that USEs the module and relies on the default mapper. +! CHECK: omp.declare_mapper @{{.*dtype_omp_default_mapper}} : !fir.type<_QMdefault_mapper_modTdtype{x:i32}> +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! 
CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(implicit, tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +program use_module_default_mapper + use default_mapper_mod + implicit none + type(dtype) :: a + !$omp target map(a) + a%x = 7 + !$omp end target + + !$omp target map(mapper(default) : a) + a%x = 8 + !$omp end target + + !$omp target + a%x = 8 + !$omp end target +end program use_module_default_mapper diff --git a/flang/test/Lower/OpenMP/derived-type-map.f90 b/flang/test/Lower/OpenMP/derived-type-map.f90 index 228e86d9e4dfd..921dd5663f8c5 100644 --- a/flang/test/Lower/OpenMP/derived-type-map.f90 +++ b/flang/test/Lower/OpenMP/derived-type-map.f90 @@ -1,6 +1,6 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: omp.declare_mapper @[[MAPPER1:_QQFmaptype_derived_implicit_allocatablescalar_and_array.omp.default.mapper]] : !fir.type<_QFmaptype_derived_implicit_allocatableTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> { +!CHECK: omp.declare_mapper @[[MAPPER1:_QQFmaptype_derived_implicit_allocatablescalar_and_array_omp_default_mapper]] : !fir.type<_QFmaptype_derived_implicit_allocatableTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> { !CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_implicitEscalar_arr"} !CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_implicitEscalar_arr"} : (!fir.ref,int:i32}>>) -> (!fir.ref,int:i32}>>, !fir.ref,int:i32}>>) diff --git a/flang/test/Lower/OpenMP/map-mapper.f90 b/flang/test/Lower/OpenMP/map-mapper.f90 index 91564bfc7bc46..8934fbb5d6edf 100644 --- a/flang/test/Lower/OpenMP/map-mapper.f90 +++ b/flang/test/Lower/OpenMP/map-mapper.f90 @@ -8,7 +8,7 @@ program p !$omp declare mapper(xx : t1 :: nn) map(to: nn, nn%x) !$omp declare mapper(t1 :: nn) map(from: nn) - !CHECK-LABEL: omp.declare_mapper @_QQFt1.omp.default.mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> + !CHECK-LABEL: omp.declare_mapper @_QQFt1_omp_default_mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> !CHECK-LABEL: omp.declare_mapper @_QQFxx : !fir.type<_QFTt1{x:!fir.array<256xi32>}> type(t1) :: a, b @@ -20,7 +20,7 @@ program p end do !$omp end target - !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1.omp.default.mapper) -> {{.*}} {name = "b"} + !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1_omp_default_mapper) -> {{.*}} {name = "b"} !CHECK: omp.target map_entries(%[[MAP_B]] -> %{{.*}}, %{{.*}} -> %{{.*}} : {{.*}}, {{.*}}) { !$omp target map(mapper(default) : b) do i = 1, n diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 94907ba3ae74a..7cd642bcf23cf 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -529,7 +529,7 @@ subroutine omp_target_device_ptr use iso_c_binding, only : c_ptr, c_loc type(c_ptr) :: a integer, target :: b - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr\.omp\.default\.mapper]]) -> {{.*}} {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) 
mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr_omp_default_mapper]]) -> {{.*}} {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}} -> %[[VAL_1:.*]] : !fir.ref>) !$omp target data map(tofrom: a) use_device_ptr(a) !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], __address : (!fir.ref>) -> !fir.ref diff --git a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 index b53bf5ce10557..9da6674c3a58d 100644 --- a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 @@ -29,7 +29,7 @@ program main !PARSE-TREE: OpenMPDeclareMapperConstruct !PARSE-TREE: OmpMapperSpecifier -!PARSE-TREE: string = 'ty.omp.default.mapper' +!PARSE-TREE: string = 'ty_omp_default_mapper' !PARSE-TREE: TypeSpec -> DerivedTypeSpec !PARSE-TREE: Name = 'ty' !PARSE-TREE: Name = 'mapped' diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 index 50a38c6494aa6..7a627913f9555 100644 --- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 +++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 @@ -57,7 +57,7 @@ subroutine f01 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareMapperConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare mapper !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpMapperSpecifier -!PARSE-TREE: | | string = 't.omp.default.mapper' +!PARSE-TREE: | | string = 't_omp_default_mapper' !PARSE-TREE: | | TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 't' !PARSE-TREE: | | Name = 'v' diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 5d77540aa6453..9a1b86758357f 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -13,7 +13,7 @@ program main !! Note, symbols come out in their respective scope, but not in declaration order. !CHECK: mymapper: MapperDetails !CHECK: ty: DerivedType components: x -!CHECK: ty.omp.default.mapper: MapperDetails +!CHECK: ty_omp_default_mapper: MapperDetails !CHECK: DerivedType scope: ty !CHECK: OtherConstruct scope: !CHECK: mapped (OmpMapToFrom) {{.*}} ObjectEntity type: TYPE(ty) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 1ac9ac040468c..58fd8a490c04a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -201,7 +201,7 @@ enum class OMPDynGroupprivateFallbackType : uint64_t { }; // Default OpenMP mapper name suffix. -inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper"; +inline constexpr const char *OmpDefaultMapperName = "_omp_default_mapper"; /// Values for bit flags used to specify the mapping type for /// offloading. From 2b22e9b13330d47ae22cb0aa8016ddbb567bf94f Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Mon, 17 Nov 2025 12:21:46 -0500 Subject: [PATCH 048/105] [SPIRV] Use a worklist in the post-legalizer (#165027) This commit refactors the SPIRV post-legalizer to use a worklist to process new instructions. Previously, the post-legalizer would iterate through all instructions and try to assign types. This could fail if a new instruction depended on another new instruction that had not been processed yet. 
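The failure mode is an ordering problem, so the natural fix is a fixed-point worklist, as the new implementation described next uses. A minimal, self-contained sketch of that idiom (generic names, not the actual pass code):

```C++
#include <utility>
#include <vector>

// Retry deferred items until a full pass over the worklist makes no
// progress: an item that fails because one of its dependencies has not
// been processed yet can succeed on a later iteration.
template <typename Item, typename TryProcess>
std::vector<Item> runToFixedPoint(std::vector<Item> worklist,
                                  TryProcess tryProcess) {
  bool changed;
  do {
    changed = false;
    std::vector<Item> deferred;
    for (Item &item : worklist) {
      if (tryProcess(item)) // e.g. a SPIR-V type was deduced
        changed = true;
      else
        deferred.push_back(std::move(item)); // retry in the next round
    }
    worklist = std::move(deferred);
  } while (changed && !worklist.empty());
  return worklist; // items that could never be resolved need a fallback
}
```

Anything still unresolved when the loop terminates needs a fallback, which is what the defaulting logic in the patch provides.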
The new implementation adds all new instructions that require a SPIR-V type to a worklist. It then iteratively processes the worklist until it is empty. This ensures that all dependencies are met before an instruction is processed. This change makes the post-legalizer more robust and fixes potential ordering issues with newly generated instructions. Existing tests cover existing functionality. More tests will be added as the legalizer is modified. Part of #153091 --- llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 370 +++++++++++++++---- 1 file changed, 302 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index d17528dd882bf..751ae0fe34d33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -17,7 +17,8 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/IR/Attributes.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Support/Debug.h" #include #define DEBUG_TYPE "spirv-postlegalizer" @@ -43,79 +44,314 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, SPIRVType *KnownResType); } // namespace llvm -static bool mayBeInserted(unsigned Opcode) { - switch (Opcode) { - case TargetOpcode::G_SMAX: - case TargetOpcode::G_UMAX: - case TargetOpcode::G_SMIN: - case TargetOpcode::G_UMIN: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMINIMUM: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMAXIMUM: - return true; +static SPIRVType *deduceIntTypeFromResult(Register ResVReg, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR) { + const LLT &Ty = MIB.getMRI()->getType(ResVReg); + return GR->getOrCreateSPIRVIntegerType(Ty.getScalarSizeInBits(), MIB); +} + +static bool deduceAndAssignTypeForGUnmerge(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register SrcReg = I->getOperand(I->getNumOperands() - 1).getReg(); + SPIRVType *ScalarType = nullptr; + if (SPIRVType *DefType = GR->getSPIRVTypeForVReg(SrcReg)) { + assert(DefType->getOpcode() == SPIRV::OpTypeVector); + ScalarType = GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); + } + + if (!ScalarType) { + // If we could not deduce the type from the source, try to deduce it from + // the uses of the results. + for (unsigned i = 0; i < I->getNumDefs() && !ScalarType; ++i) { + for (const auto &Use : + MRI.use_nodbg_instructions(I->getOperand(i).getReg())) { + assert(Use.getOpcode() == TargetOpcode::G_BUILD_VECTOR && + "Expected use of G_UNMERGE_VALUES to be a G_BUILD_VECTOR"); + if (auto *VecType = + GR->getSPIRVTypeForVReg(Use.getOperand(0).getReg())) { + ScalarType = GR->getScalarOrVectorComponentType(VecType); + break; + } + } + } + } + + if (!ScalarType) + return false; + + for (unsigned i = 0; i < I->getNumDefs(); ++i) { + Register DefReg = I->getOperand(i).getReg(); + if (GR->getSPIRVTypeForVReg(DefReg)) + continue; + + LLT DefLLT = MRI.getType(DefReg); + SPIRVType *ResType = + DefLLT.isVector() + ? 
GR->getOrCreateSPIRVVectorType( + ScalarType, DefLLT.getNumElements(), *I, + *MF.getSubtarget().getInstrInfo()) + : ScalarType; + setRegClassType(DefReg, ResType, GR, &MRI, MF); + } + return true; +} + +static SPIRVType *deduceTypeFromSingleOperand(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned OpIdx) { + Register OpReg = I->getOperand(OpIdx).getReg(); + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(OpReg)) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MIB.getMRI()->getType(ResVReg); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType(CompType, ResLLT.getNumElements(), + MIB, false); + return CompType; + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromOperandRange(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned StartOp, unsigned EndOp) { + SPIRVType *ResType = nullptr; + for (unsigned i = StartOp; i < EndOp; ++i) { + if (SPIRVType *Type = deduceTypeFromSingleOperand(I, MIB, GR, i)) { +#ifdef EXPENSIVE_CHECKS + assert((!ResType || Type == ResType) && "Conflicting type from operands."); + ResType = Type; +#else + return Type; +#endif + } + } + return ResType; +} + +static SPIRVType *deduceTypeForResultRegister(MachineInstr *Use, + Register UseRegister, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + for (const MachineOperand &MO : Use->defs()) { + if (!MO.isReg()) + continue; + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(MO.getReg())) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + const LLT &ResLLT = MIB.getMRI()->getType(UseRegister); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + return CompType; + } + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromUses(Register Reg, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) { + SPIRVType *ResType = nullptr; + switch (Use.getOpcode()) { + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_UNMERGE_VALUES: + LLVM_DEBUG(dbgs() << "Looking at use " << Use << "\n"); + ResType = deduceTypeForResultRegister(&Use, Reg, GR, MIB); + break; + } + if (ResType) + return ResType; + } + return nullptr; +} + +static SPIRVType *deduceResultTypeFromOperands(MachineInstr *I, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + Register ResVReg = I->getOperand(0).getReg(); + switch (I->getOpcode()) { + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_ANYEXT: + return deduceIntTypeFromResult(ResVReg, MIB, GR); + case TargetOpcode::G_BUILD_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, I->getNumOperands()); + case TargetOpcode::G_SHUFFLE_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, 3); default: - return isTypeFoldingSupported(Opcode); + if (I->getNumDefs() == 1 && I->getNumOperands() > 1 && + I->getOperand(1).isReg()) + return deduceTypeFromSingleOperand(I, MIB, GR, 1); + return nullptr; } } -static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +static bool deduceAndAssignSpirvType(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + LLVM_DEBUG(dbgs() << "\nProcessing instruction: " << *I); MachineRegisterInfo &MRI = MF.getRegInfo(); + Register ResVReg = 
I->getOperand(0).getReg(); + + // G_UNMERGE_VALUES is handled separately because it has multiple definitions, + // unlike the other instructions which have a single result register. The main + // deduction logic is designed for the single-definition case. + if (I->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) + return deduceAndAssignTypeForGUnmerge(I, MF, GR); + + LLVM_DEBUG(dbgs() << "Inferring type from operands\n"); + SPIRVType *ResType = deduceResultTypeFromOperands(I, GR, MIB); + if (!ResType) { + LLVM_DEBUG(dbgs() << "Inferring type from uses\n"); + ResType = deduceTypeFromUses(ResVReg, MF, GR, MIB); + } + + if (!ResType) + return false; + + LLVM_DEBUG(dbgs() << "Assigned type to " << *I << ": " << *ResType); + GR->assignSPIRVTypeToVReg(ResType, ResVReg, MF); + if (!MRI.getRegClassOrNull(ResVReg)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); + } + return true; +} + +static bool requiresSpirvType(MachineInstr &I, SPIRVGlobalRegistry *GR, + MachineRegisterInfo &MRI) { + LLVM_DEBUG(dbgs() << "Checking if instruction requires a SPIR-V type: " + << I;); + if (I.getNumDefs() == 0) { + LLVM_DEBUG(dbgs() << "Instruction does not have a definition.\n"); + return false; + } + + if (!I.isPreISelOpcode()) { + LLVM_DEBUG(dbgs() << "Instruction is not a generic instruction.\n"); + return false; + } + + Register ResultRegister = I.defs().begin()->getReg(); + if (GR->getSPIRVTypeForVReg(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Instruction already has a SPIR-V type.\n"); + if (!MRI.getRegClassOrNull(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResultRegister, GR->getSPIRVTypeForVReg(ResultRegister), + GR, &MRI, *GR->CurMF, true); + } + return false; + } + + return true; +} + +static void registerSpirvTypeForNewInstructions(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector Worklist; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &I : MBB) { - const unsigned Opcode = I.getOpcode(); - if (Opcode == TargetOpcode::G_UNMERGE_VALUES) { - unsigned ArgI = I.getNumOperands() - 1; - Register SrcReg = I.getOperand(ArgI).isReg() - ? I.getOperand(ArgI).getReg() - : Register(0); - SPIRVType *DefType = - SrcReg.isValid() ? GR->getSPIRVTypeForVReg(SrcReg) : nullptr; - if (!DefType || DefType->getOpcode() != SPIRV::OpTypeVector) - report_fatal_error( - "cannot select G_UNMERGE_VALUES with a non-vector argument"); - SPIRVType *ScalarType = - GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); - for (unsigned i = 0; i < I.getNumDefs(); ++i) { - Register ResVReg = I.getOperand(i).getReg(); - SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResVReg); - if (!ResType) { - // There was no "assign type" actions, let's fix this now - ResType = ScalarType; - setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); - } - } - } else if (mayBeInserted(Opcode) && I.getNumDefs() == 1 && - I.getNumOperands() > 1 && I.getOperand(1).isReg()) { - // Legalizer may have added a new instructions and introduced new - // registers, we must decorate them as if they were introduced in a - // non-automatic way - Register ResVReg = I.getOperand(0).getReg(); - // Check if the register defined by the instruction is newly generated - // or already processed - // Check if we have type defined for operands of the new instruction - bool IsKnownReg = MRI.getRegClassOrNull(ResVReg); - SPIRVType *ResVType = GR->getSPIRVTypeForVReg( - IsKnownReg ? 
ResVReg : I.getOperand(1).getReg()); - if (!ResVType) - continue; - // Set type & class - if (!IsKnownReg) - setRegClassType(ResVReg, ResVType, GR, &MRI, *GR->CurMF, true); - // If this is a simple operation that is to be reduced by TableGen - // definition we must apply some of pre-legalizer rules here - if (isTypeFoldingSupported(Opcode)) { - processInstr(I, MIB, MRI, GR, GR->getSPIRVTypeForVReg(ResVReg)); - if (IsKnownReg && MRI.hasOneUse(ResVReg)) { - MachineInstr &UseMI = *MRI.use_instr_begin(ResVReg); - if (UseMI.getOpcode() == SPIRV::ASSIGN_TYPE) - continue; - } - insertAssignInstr(ResVReg, nullptr, ResVType, GR, MIB, MRI); + if (requiresSpirvType(I, GR, MRI)) { + Worklist.push_back(&I); + } + } + } + + if (Worklist.empty()) { + LLVM_DEBUG(dbgs() << "Initial worklist is empty.\n"); + return; + } + + LLVM_DEBUG(dbgs() << "Initial worklist:\n"; + for (auto *I : Worklist) { I->dump(); }); + + bool Changed; + do { + Changed = false; + SmallVector NextWorklist; + + for (MachineInstr *I : Worklist) { + MachineIRBuilder MIB(*I); + if (deduceAndAssignSpirvType(I, MF, GR, MIB)) { + Changed = true; + } else { + NextWorklist.push_back(I); + } + } + Worklist = std::move(NextWorklist); + LLVM_DEBUG(dbgs() << "Worklist size: " << Worklist.size() << "\n"); + } while (Changed); + + if (Worklist.empty()) + return; + + for (auto *I : Worklist) { + MachineIRBuilder MIB(*I); + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MRI.getType(ResVReg); + SPIRVType *ResType = nullptr; + if (ResLLT.isVector()) { + SPIRVType *CompType = GR->getOrCreateSPIRVIntegerType( + ResLLT.getElementType().getSizeInBits(), MIB); + ResType = GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + } else { + ResType = GR->getOrCreateSPIRVIntegerType(ResLLT.getSizeInBits(), MIB); + } + LLVM_DEBUG(dbgs() << "Could not determine type for " << *I + << ", defaulting to " << *ResType << "\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, MF, true); + } +} + +static void ensureAssignTypeForTypeFolding(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + LLVM_DEBUG(dbgs() << "Entering ensureAssignTypeForTypeFolding for function " + << MF.getName() << "\n"); + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isTypeFoldingSupported(MI.getOpcode())) + continue; + if (MI.getNumOperands() == 1 || !MI.getOperand(1).isReg()) + continue; + + LLVM_DEBUG(dbgs() << "Processing instruction: " << MI); + + // Check uses of MI to see if it already has a use in SPIRV::ASSIGN_TYPE + bool HasAssignType = false; + Register ResultRegister = MI.defs().begin()->getReg(); + // All uses of Result register + for (MachineInstr &UseInstr : + MRI.use_nodbg_instructions(ResultRegister)) { + if (UseInstr.getOpcode() == SPIRV::ASSIGN_TYPE) { + HasAssignType = true; + LLVM_DEBUG(dbgs() << " Instruction already has an ASSIGN_TYPE use: " + << UseInstr); + break; } } + + if (!HasAssignType) { + Register ResultRegister = MI.defs().begin()->getReg(); + SPIRVType *ResultType = GR->getSPIRVTypeForVReg(ResultRegister); + LLVM_DEBUG( + dbgs() << " Adding ASSIGN_TYPE for ResultRegister: " + << printReg(ResultRegister, MRI.getTargetRegisterInfo()) + << " with type: " << *ResultType); + MachineIRBuilder MIB(MI); + insertAssignInstr(ResultRegister, nullptr, ResultType, GR, MIB, MRI); + } } } } @@ -155,10 +391,8 @@ bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { const SPIRVSubtarget &ST = MF.getSubtarget(); SPIRVGlobalRegistry *GR = 
ST.getSPIRVGlobalRegistry(); GR->setCurrentFunc(MF); - MachineIRBuilder MIB(MF); - - processNewInstrs(MF, GR, MIB); - + registerSpirvTypeForNewInstructions(MF, GR); + ensureAssignTypeForTypeFolding(MF, GR); return true; } From be6296ea8faccec5d2fbaa2625112e26a5deeb85 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 17 Nov 2025 18:32:32 +0100 Subject: [PATCH 049/105] [RISCV] Fold Zba-expanded (mul (shr exact X, C1), C2) (#168019) --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 5 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11 ++-- llvm/test/CodeGen/RISCV/rv64zba.ll | 71 +++++++++++++++++++++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 511cb56f73dcb..557dbf8c7ca39 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -903,6 +903,11 @@ template inline BinaryOpc_match m_Srl(const LHS &L, const RHS &R) { return BinaryOpc_match(ISD::SRL, L, R); } +template +inline auto m_ExactSr(const LHS &L, const RHS &R) { + return m_AnyOf(BinaryOpc_match(ISD::SRA, L, R, SDNodeFlags::Exact), + BinaryOpc_match(ISD::SRL, L, R, SDNodeFlags::Exact)); +} template inline BinaryOpc_match m_Rotl(const LHS &L, const RHS &R) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f313d3f1347d4..fb298ee35d6c2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16798,9 +16798,7 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, // because X is exact (Y >> M + 2). uint64_t ShAmt = Log2_64(MulAmtLowBit) + 2; using namespace SDPatternMatch; - return sd_match(X, m_AnyOf(m_Sra(m_Value(), m_SpecificInt(ShAmt)), - m_Srl(m_Value(), m_SpecificInt(ShAmt)))) && - X->getFlags().hasExact(); + return sd_match(X, m_ExactSr(m_Value(), m_SpecificInt(ShAmt))); }; if (isPowerOf2_64(MulAmt - MulAmtLowBit) && !(CanSub && PreferSub())) { Op = ISD::ADD; @@ -16825,10 +16823,13 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming a slli.uw. + // Put the shift first if we can fold: + // a. a zext into the shift forming a slli.uw + // b. 
an exact shift right forming one shorter shift or no shift at all using namespace SDPatternMatch; if (Shift != 0 && - sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + sd_match(X, m_AnyOf(m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))), + m_ExactSr(m_Value(), m_ConstInt())))) { X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); Shift = 0; } diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 4ab4ff84dac57..fb26b8b16a290 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -5016,3 +5016,74 @@ define ptr @shl_add_knownbits(ptr %p, i64 %i) { %r = getelementptr i8, ptr %p, i64 %shr ret ptr %r } + +define i64 @exactashr1mul6(i64 %a) { +; RV64I-LABEL: exactashr1mul6: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul6: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul6: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 6 + ret i64 %d +} + +define i64 @exactlshr3mul22(i64 %a) { +; RV64I-LABEL: exactlshr3mul22: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 3 +; RV64I-NEXT: li a1, 22 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactlshr3mul22: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srli a0, a0, 2 +; RV64ZBA-NEXT: sh2add a1, a0, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactlshr3mul22: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: srli a0, a0, 2 +; RV64XANDESPERF-NEXT: nds.lea.w a1, a0, a0 +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1 +; RV64XANDESPERF-NEXT: ret + %c = lshr exact i64 %a, 3 + %d = mul i64 %c, 22 + ret i64 %d +} + +define i64 @exactashr1mul36(i64 %a) { +; RV64I-LABEL: exactashr1mul36: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul36: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a0, a0, 1 +; RV64ZBA-NEXT: sh3add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul36: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: slli a0, a0, 1 +; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 36 + ret i64 %d +} From a9a4515b0a442ea58826047b7efb9aa2bfe48749 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 17 Nov 2025 09:38:02 -0800 Subject: [PATCH 050/105] [lld][MachO] Read cstring order for non deduped sections (#161879) https://github.com/llvm/llvm-project/pull/140307 added support for cstring hashes in the orderfile to layout cstrings in a specific order, but only when `--deduplicate-strings` is used. This PR supports cstring ordering when `--no-deduplicate-strings` is used. 1. Create `cStringPriorities`, separate from `priorities`, to hold only priorities for cstring pieces. This allows us to lookup by hash directly, instead of first converting to a string. It also fixes a contrived bug where we want to order a symbol named `CSTR;12345` rather than a cstring. 2. Rather than calling `buildCStringPriorities()` which always constructs and returns a vector, we use `forEachStringPiece()` to efficiently iterate over cstring pieces without creating a new vector if no cstring is ordered. 3. 
Create `SymbolPriorityEntry::{get,set}Priority()` helper functions to simplify code. --- lld/MachO/SectionPriorities.cpp | 130 +++++++++++++++------------- lld/MachO/SectionPriorities.h | 28 ++++-- lld/MachO/SyntheticSections.cpp | 62 +++++++------ lld/test/MachO/order-file-cstring.s | 22 ++--- 4 files changed, 129 insertions(+), 113 deletions(-) diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp index cf657aad5d145..b652d1ee8325f 100644 --- a/lld/MachO/SectionPriorities.cpp +++ b/lld/MachO/SectionPriorities.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/xxhash.h" #include @@ -246,33 +247,45 @@ DenseMap CallGraphSort::run() { return orderMap; } -std::optional -macho::PriorityBuilder::getSymbolOrCStringPriority(const StringRef key, - InputFile *f) { +void macho::PriorityBuilder::SymbolPriorityEntry::setPriority( + int priority, StringRef objectFile) { + if (!objectFile.empty()) + objectFiles.try_emplace(objectFile, priority); + else + anyObjectFile = std::min(anyObjectFile, priority); +} - auto it = priorities.find(key); - if (it == priorities.end()) - return std::nullopt; - const SymbolPriorityEntry &entry = it->second; +int macho::PriorityBuilder::SymbolPriorityEntry::getPriority( + const InputFile *f) const { if (!f) - return entry.anyObjectFile; + return anyObjectFile; // We don't use toString(InputFile *) here because it returns the full path // for object files, and we only want the basename. - StringRef filename; - if (f->archiveName.empty()) - filename = path::filename(f->getName()); - else - filename = saver().save(path::filename(f->archiveName) + "(" + - path::filename(f->getName()) + ")"); - return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile); + StringRef basename = path::filename(f->getName()); + StringRef filename = + f->archiveName.empty() + ? 
basename + : saver().save(path::filename(f->archiveName) + "(" + basename + ")"); + return std::min(objectFiles.lookup(filename), anyObjectFile); } std::optional -macho::PriorityBuilder::getSymbolPriority(const Defined *sym) { +macho::PriorityBuilder::getCStringPriority(uint32_t hash, + const InputFile *f) const { + auto it = cStringPriorities.find(hash); + if (it == cStringPriorities.end()) + return std::nullopt; + return it->second.getPriority(f); +} + +std::optional +macho::PriorityBuilder::getSymbolPriority(const Defined *sym) const { if (sym->isAbsolute()) return std::nullopt; - return getSymbolOrCStringPriority(utils::getRootSymbol(sym->getName()), - sym->isec()->getFile()); + auto it = priorities.find(utils::getRootSymbol(sym->getName())); + if (it == priorities.end()) + return std::nullopt; + return it->second.getPriority(sym->isec()->getFile()); } void macho::PriorityBuilder::extractCallGraphProfile() { @@ -307,7 +320,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { int prio = std::numeric_limits::min(); MemoryBufferRef mbref = *buffer; for (StringRef line : args::getLines(mbref)) { - StringRef objectFile, symbolOrCStrHash; + StringRef objectFile; line = line.take_until([](char c) { return c == '#'; }); // ignore comments line = line.ltrim(); @@ -338,22 +351,16 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { } // The rest of the line is either or - // CStringEntryPrefix + // cStringEntryPrefix line = line.trim(); - if (line.starts_with(CStringEntryPrefix)) { - StringRef possibleHash = line.drop_front(CStringEntryPrefix.size()); + if (line.consume_front(cStringEntryPrefix)) { uint32_t hash = 0; - if (to_integer(possibleHash, hash)) - symbolOrCStrHash = possibleHash; - } else - symbolOrCStrHash = utils::getRootSymbol(line); - - if (!symbolOrCStrHash.empty()) { - SymbolPriorityEntry &entry = priorities[symbolOrCStrHash]; - if (!objectFile.empty()) - entry.objectFiles.insert(std::make_pair(objectFile, prio)); - else - entry.anyObjectFile = std::min(entry.anyObjectFile, prio); + if (to_integer(line, hash)) + cStringPriorities[hash].setPriority(prio, objectFile); + } else { + StringRef symbol = utils::getRootSymbol(line); + if (!symbol.empty()) + priorities[symbol].setPriority(prio, objectFile); } ++prio; @@ -405,40 +412,39 @@ macho::PriorityBuilder::buildInputSectionPriorities() { return sectionPriorities; } -std::vector macho::PriorityBuilder::buildCStringPriorities( - ArrayRef inputs) { - // Split the input strings into hold and cold sets. - // Order hot set based on -order_file_cstring for performance improvement; - // TODO: Order cold set of cstrings for compression via BP. - std::vector> - hotStringPrioritiesAndStringPieces; - std::vector coldStringPieces; - std::vector orderedStringPieces; - +void macho::PriorityBuilder::forEachStringPiece( + ArrayRef inputs, + std::function f, + bool forceInputOrder, bool computeHash) const { + std::vector> orderedPieces; + std::vector> unorderedPieces; for (CStringInputSection *isec : inputs) { for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) { if (!piece.live) continue; - - std::optional priority = getSymbolOrCStringPriority( - std::to_string(piece.hash), isec->getFile()); - if (!priority) - coldStringPieces.emplace_back(isec, stringPieceIdx); + // Process pieces in input order if we have no cstrings in our orderfile + if (forceInputOrder || cStringPriorities.empty()) { + f(*isec, piece, stringPieceIdx); + continue; + } + uint32_t hash = + computeHash + ? 
(xxh3_64bits(isec->getStringRef(stringPieceIdx)) & 0x7fffffff) + : piece.hash; + if (auto priority = getCStringPriority(hash, isec->getFile())) + orderedPieces.emplace_back(*priority, isec, stringPieceIdx); else - hotStringPrioritiesAndStringPieces.emplace_back( - *priority, std::make_pair(isec, stringPieceIdx)); + unorderedPieces.emplace_back(isec, stringPieceIdx); } } - - // Order hot set for perf - llvm::stable_sort(hotStringPrioritiesAndStringPieces); - for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces) - orderedStringPieces.push_back(stringPiecePair); - - // TODO: Order cold set for compression - - orderedStringPieces.insert(orderedStringPieces.end(), - coldStringPieces.begin(), coldStringPieces.end()); - - return orderedStringPieces; + if (orderedPieces.empty() && unorderedPieces.empty()) + return; + llvm::stable_sort(orderedPieces, [](const auto &left, const auto &right) { + return std::get<0>(left) < std::get<0>(right); + }); + for (auto &[priority, isec, pieceIdx] : orderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); + // TODO: Add option to order the remaining cstrings for compression + for (auto &[isec, pieceIdx] : unorderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); } diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h index cc4e30fffc600..24d2dbc47e498 100644 --- a/lld/MachO/SectionPriorities.h +++ b/lld/MachO/SectionPriorities.h @@ -16,7 +16,6 @@ namespace lld::macho { using SectionPair = std::pair; -using StringPiecePair = std::pair; class PriorityBuilder { public: @@ -29,7 +28,7 @@ class PriorityBuilder { // // An order file has one entry per line, in the following format: // - // ::[ | CStringEntryPrefix ] + // ::[ | cStringEntryPrefix ] // // and are optional. // If not specified, then that entry tries to match either, @@ -42,7 +41,7 @@ class PriorityBuilder { // lowest-ordered entry (the one nearest to the front of the list.) // // or 2) any cstring literal with the given hash, if the entry has the - // CStringEntryPrefix prefix defined below in the file. is the + // cStringEntryPrefix prefix defined below in the file. is the // hash of cstring literal content. // // Cstring literals are not symbolized, we can't identify them by name @@ -54,6 +53,16 @@ class PriorityBuilder { // The file can also have line comments that start with '#'. void parseOrderFile(StringRef path); + /// Call \p f for each string piece in \p inputs. If there are any cstring + /// literals in the orderfile (and \p forceInputOrder is false) then string + /// pieces are ordered by the orderfile. \p computeHash must be set when + /// \p deduplicateLiterals is false because then the string piece hash is not + /// set. + void forEachStringPiece( + ArrayRef inputs, + std::function f, + bool forceInputOrder = false, bool computeHash = false) const; + // Returns layout priorities for some or all input sections. Sections are laid // out in decreasing order; that is, a higher priority section will be closer // to the beginning of its output section. @@ -66,8 +75,6 @@ class PriorityBuilder { // Each section gets assigned the priority of the highest-priority symbol it // contains. llvm::DenseMap buildInputSectionPriorities(); - std::vector - buildCStringPriorities(ArrayRef); private: // The symbol with the smallest priority should be ordered first in the output @@ -78,13 +85,16 @@ class PriorityBuilder { int anyObjectFile = 0; // The priority given to a matching symbol from a particular object file. 
llvm::DenseMap objectFiles; + void setPriority(int priority, StringRef objectFile); + int getPriority(const InputFile *f) const; }; - const llvm::StringRef CStringEntryPrefix = "CSTR;"; + const llvm::StringRef cStringEntryPrefix = "CSTR;"; - std::optional getSymbolPriority(const Defined *sym); - std::optional getSymbolOrCStringPriority(const StringRef key, - InputFile *f); + std::optional getSymbolPriority(const Defined *sym) const; + std::optional getCStringPriority(uint32_t hash, + const InputFile *f) const; llvm::DenseMap priorities; + llvm::DenseMap cStringPriorities; llvm::MapVector callGraphProfile; }; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 187cccbe90dbc..fecc51f912b08 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1721,26 +1721,24 @@ void CStringSection::writeTo(uint8_t *buf) const { // and don't need this alignment. They will be emitted at some arbitrary address // `A`, but ld64 will treat them as being 16-byte aligned with an offset of // `16 % A`. -static Align getStringPieceAlignment(const CStringInputSection *isec, +static Align getStringPieceAlignment(const CStringInputSection &isec, const StringPiece &piece) { - return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff)); + return llvm::Align(1ULL << llvm::countr_zero(isec.align | piece.inSecOff)); } void CStringSection::finalizeContents() { size = 0; - // TODO: Call buildCStringPriorities() to support cstring ordering when - // deduplication is off, although this may negatively impact build - // performance. - for (CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); - StringRef string = isec->getStringRef(i); - size = piece.outSecOff + string.size() + 1; // account for null terminator - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); + StringRef string = isec.getStringRef(pieceIdx); + size = + piece.outSecOff + string.size() + 1; // account for null terminator + }, + /*forceInputOrder=*/false, /*computeHash=*/true); + for (CStringInputSection *isec : inputs) isec->isFinal = true; - } } void DeduplicatedCStringSection::finalizeContents() { @@ -1748,20 +1746,19 @@ void DeduplicatedCStringSection::finalizeContents() { DenseMap strToAlignment; // Used for tail merging only std::vector deduplicatedStrs; - for (const CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - auto s = isec->getCachedHashStringRef(i); - assert(isec->align != 0); - auto align = getStringPieceAlignment(isec, piece); - auto [it, wasInserted] = strToAlignment.try_emplace(s, align); - if (config->tailMergeStrings && wasInserted) - deduplicatedStrs.push_back(s); - if (!wasInserted && it->second < align) - it->second = align; - } - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); + assert(isec.align != 0); + auto align = getStringPieceAlignment(isec, piece); + auto [it, wasInserted] = strToAlignment.try_emplace(s, align); + if (config->tailMergeStrings && wasInserted) + deduplicatedStrs.push_back(s); + if (!wasInserted && it->second < align) + it->second = align; 
+ }, + /*forceInputOrder=*/true); // Like lexigraphical sort, except we read strings in reverse and take the // longest string first @@ -1801,9 +1798,10 @@ void DeduplicatedCStringSection::finalizeContents() { // Sort the strings for performance and compression size win, and then // assign an offset for each string and save it to the corresponding // StringPieces for easy access. - for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) { - auto &piece = isec->pieces[i]; - auto s = isec->getCachedHashStringRef(i); + priorityBuilder.forEachStringPiece(inputs, [&](CStringInputSection &isec, + StringPiece &piece, + size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); // Any string can be tail merged with itself with an offset of zero uint64_t tailMergeOffset = 0; auto mergeIt = @@ -1829,7 +1827,7 @@ void DeduplicatedCStringSection::finalizeContents() { stringOffsetMap[tailMergedString] = piece.outSecOff; assert(isAligned(strToAlignment.at(tailMergedString), piece.outSecOff)); } - } + }); for (CStringInputSection *isec : inputs) isec->isFinal = true; } diff --git a/lld/test/MachO/order-file-cstring.s b/lld/test/MachO/order-file-cstring.s index 3c6d2a377dc38..d6734308fffdf 100644 --- a/lld/test/MachO/order-file-cstring.s +++ b/lld/test/MachO/order-file-cstring.s @@ -4,32 +4,34 @@ # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/more-cstrings.s -o %t/more-cstrings.o -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1 | FileCheck %s --check-prefix=ONE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1 | FileCheck %s --check-prefix=ONE_SEC +# RUN: %lld --no-deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1-dup %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1-dup | FileCheck %s --check-prefix=ONE_SYM +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1-dup | FileCheck %s --check-prefix=ONE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-2 | FileCheck %s --check-prefix=TWO_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-2 | FileCheck %s --check-prefix=TWO_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-3 | FileCheck %s --check-prefix=THREE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-3 | FileCheck %s 
--check-prefix=THREE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-4 | FileCheck %s --check-prefix=FOUR_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC # RUN: llvm-readobj --string-dump=__cstring %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC_ESCAPE - # We expect: -# 1) Covered cstring symbols are reordered -# 2) the rest of the cstring symbols remain original relative order within the cstring section +# 1) Covered cstring symbols to be reordered +# 2) the rest of the cstring symbols remain in the original relative order within the cstring section # ORIGIN_SYM: _local_foo1 # ORIGIN_SYM: _globl_foo2 @@ -58,8 +60,8 @@ CSTR;1496286555 #foo3 CSTR;1343999025 -# ONE_SYM: _globl_foo2 -# ONE_SYM: _local_foo2 +# ONE_SYM-DAG: _globl_foo2 +# ONE_SYM-DAG: _local_foo2 # ONE_SYM: _bar # ONE_SYM: _bar2 # ONE_SYM: _globl_foo3 From eb879ac50b27d4651d8650b7d769cf651d0a89bd Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 09:45:14 -0800 Subject: [PATCH 051/105] [CI] Make premerge upload/write comments (#166609) Currently this is done only for Linux, as the issue-write workflow does not yet support writing out multiple comments. This gets the ball rolling as the failures that most people see are common to both platforms. Ensuring we have coverage on Windows for comments will be done in a future patch. --- .ci/premerge_advisor_explain.py | 3 ++- .github/workflows/premerge.yaml | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py index e1bc59f389b36..269f75cace266 100644 --- a/.ci/premerge_advisor_explain.py +++ b/.ci/premerge_advisor_explain.py @@ -39,6 +39,7 @@ def get_comment( ) -> dict[str, str]: repo = github.Github(github_token).get_repo("llvm/llvm-project") pr = repo.get_issue(pr_number).as_pull_request() + body = COMMENT_TAG.format(platform=platform.system()) + "\n" + body comment = {"body": body} comment_id = get_comment_id(platform.system(), pr) if comment_id: @@ -128,7 +129,7 @@ def main( ), ) ] - with open("comment", "w") as comment_file_handle: + with open("comments", "w") as comment_file_handle: json.dump(comments, comment_file_handle) else: print(advisor_response.reason) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 02a6f3b868d85..daf88b5b22125 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -119,6 +119,14 @@ jobs: path: artifacts/ retention-days: 5 include-hidden-files: 'true' + - name: Upload Comment + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: ${{ always() && !startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') }} + continue-on-error: true + with: + name: workflow-args + path: | + comments premerge-checks-windows: name: Build and Test Windows From a770d2b439ec246002cd77ce33e52f6efa577849 Mon Sep 17 00:00:00 2001 From: Jeremy Furtek Date: Mon, 17 Nov 2025 11:46:39 -0600 Subject: [PATCH 052/105] Add 'exact' flag to arith.shrui/shrsi/divsi/divui operations (#165923) This MR adds support for the `exact` flag to the `arith.shrui/shrsi/divsi/divui` operations. The semantics are identical to those of the LLVM dialect and the LLVM language reference. 
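Concretely, the `exact` semantics referenced above can be summarized as follows (a sketch; the helper names are ours, purely illustrative, not part of the patch): if the corresponding check fails, the result of the flagged operation is poison.

```C++
#include <cstdint>

// `exact` on a signed division: no nonzero remainder may be discarded.
bool divIsExact(int32_t x, int32_t y) {
  if (y == 0)
    return false; // division by zero is undefined regardless of `exact`
  if (y == -1)
    return true; // -1 divides everything; also avoids INT32_MIN % -1 overflow
  return x % y == 0;
}

// `exact` on a right shift: no nonzero bits may be shifted out.
bool shrIsExact(uint32_t x, uint32_t shamt) {
  return shamt < 32 && (x & ((uint32_t{1} << shamt) - 1)) == 0;
}
```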
This MR also modifies the mechanism for converting `arith` dialect **attributes** to corresponding **properties** in the `LLVM` dialect. (As a specific example, the integer overflow flags `nsw/nuw` are **properties** in the `LLVM` dialect, as opposed to attributes.) Previously, attribute converter classes were required to have a specific method to support integer overflow flags: ```C++ template class AttrConvertPassThrough { public: ... LLVM::IntegerOverflowFlags getOverflowFlags() const { return LLVM::IntegerOverflowFlags::none; } }; ``` This method was required, even for `arith` source operations that did not use integer overflow flags (e.g. `AttrConvertFastMathToLLVM`). This MR modifies the interface required by `arith` dialect attribute converters to instead provide a (possibly NULL) properties attribute: ```C++ template class AttrConvertPassThrough { public: ... Attribute getPropAttr() const { return {}; } }; ``` For `arith` operations with attributes that map to `LLVM` dialect **properties**, the attribute converter can create a `DictionaryAttr` containing target properties and return that attribute from the attribute converter's `getPropAttr()` method. The `arith` attribute conversion framework will set the `propertiesAttr` of an `OperationState`, and the target operation's `setPropertiesFromAttr()` method will be invoked to set the properties when the target operation is created. The `AttrConvertOverflowToLLVM` class in this MR uses the new approach. --- .../ArithCommon/AttrToLLVMConverter.h | 32 ++++++----- .../mlir/Conversion/LLVMCommon/Pattern.h | 20 ++++--- .../Conversion/LLVMCommon/VectorPattern.h | 34 ++++++------ .../include/mlir/Dialect/Arith/IR/ArithOps.td | 54 +++++++++++++++---- .../Conversion/ArithToLLVM/ArithToLLVM.cpp | 1 + .../ComplexToLLVM/ComplexToLLVM.cpp | 3 +- mlir/lib/Conversion/LLVMCommon/Pattern.cpp | 21 +++----- .../Conversion/LLVMCommon/VectorPattern.cpp | 21 ++++---- .../Dialect/Arith/IR/ArithCanonicalization.td | 4 +- .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 16 ++++++ mlir/test/Dialect/Arith/canonicalize.mlir | 13 +++++ mlir/test/Dialect/Arith/ops.mlir | 24 +++++++++ 12 files changed, 167 insertions(+), 76 deletions(-) diff --git a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h index 7ffc861331760..7020e24517d09 100644 --- a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h +++ b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h @@ -65,11 +65,8 @@ class AttrConvertFastMathToLLVM { convertArithFastMathAttrToLLVM(arithFMFAttr)); } } - ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { - return LLVM::IntegerOverflowFlags::none; - } + Attribute getPropAttr() const { return {}; } private: NamedAttrList convertedAttr; @@ -82,23 +79,36 @@ template class AttrConvertOverflowToLLVM { public: AttrConvertOverflowToLLVM(SourceOp srcOp) { + using IntegerOverflowFlagsAttr = LLVM::IntegerOverflowFlagsAttr; + // Copy the source attributes. convertedAttr = NamedAttrList{srcOp->getAttrs()}; // Get the name of the arith overflow attribute. StringRef arithAttrName = SourceOp::getIntegerOverflowAttrName(); - // Remove the source overflow attribute. + // Remove the source overflow attribute from the set that will be present + // in the target. 
if (auto arithAttr = dyn_cast_if_present( convertedAttr.erase(arithAttrName))) { - overflowFlags = convertArithOverflowFlagsToLLVM(arithAttr.getValue()); + auto llvmFlag = convertArithOverflowFlagsToLLVM(arithAttr.getValue()); + // Create a dictionary attribute holding the overflow flags property. + // (In the LLVM dialect, the overflow flags are a property, not an + // attribute.) + MLIRContext *ctx = srcOp.getOperation()->getContext(); + Builder b(ctx); + auto llvmFlagAttr = IntegerOverflowFlagsAttr::get(ctx, llvmFlag); + StringRef llvmAttrName = TargetOp::getOverflowFlagsAttrName(); + NamedAttribute attr{llvmAttrName, llvmFlagAttr}; + // Set the properties attribute of the operation state so that the + // property can be updated when the operation is created. + propertiesAttr = b.getDictionaryAttr(ArrayRef(attr)); } } - ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { return overflowFlags; } + Attribute getPropAttr() const { return propertiesAttr; } private: NamedAttrList convertedAttr; - LLVM::IntegerOverflowFlags overflowFlags = LLVM::IntegerOverflowFlags::none; + DictionaryAttr propertiesAttr; }; template @@ -129,9 +139,7 @@ class AttrConverterConstrainedFPToLLVM { } ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { - return LLVM::IntegerOverflowFlags::none; - } + Attribute getPropAttr() const { return {}; } private: NamedAttrList convertedAttr; diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h index c292e3727f46c..f8e0ccc093f8b 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -19,16 +19,14 @@ class CallOpInterface; namespace LLVM { namespace detail { -/// Handle generically setting flags as native properties on LLVM operations. -void setNativeProperties(Operation *op, IntegerOverflowFlags overflowFlags); - /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. -LogicalResult oneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none); +LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + ArrayRef targetAttrs, + Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); /// Replaces the given operation "op" with a call to an LLVM intrinsic with the /// specified name "intrinsic" and operands. 
@@ -307,9 +305,9 @@ class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern {
  LogicalResult
  matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
-    return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(),
-                                         adaptor.getOperands(), op->getAttrs(),
-                                         *this->getTypeConverter(), rewriter);
+    return LLVM::detail::oneToOneRewrite(
+        op, TargetOp::getOperationName(), adaptor.getOperands(), op->getAttrs(),
+        /*propertiesAttr=*/Attribute{}, *this->getTypeConverter(), rewriter);
  }
};

diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
index e7ab63abfeaa1..47b8381eefda8 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
@@ -54,25 +54,26 @@ LogicalResult handleMultidimensionalVectors(
    std::function createOperand,
    ConversionPatternRewriter &rewriter);

-LogicalResult vectorOneToOneRewrite(
-    Operation *op, StringRef targetOp, ValueRange operands,
-    ArrayRef targetAttrs,
-    const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter,
-    IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none);
+LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp,
+                                    ValueRange operands,
+                                    ArrayRef targetAttrs,
+                                    Attribute propertiesAttr,
+                                    const LLVMTypeConverter &typeConverter,
+                                    ConversionPatternRewriter &rewriter);
} // namespace detail
} // namespace LLVM

// Default attribute conversion class, which passes all source attributes
-// through to the target op, unmodified.
+// through to the target op, unmodified. The attribute used to set properties
+// of the target operation will be nullptr (i.e. any properties that exist
+// will have default values).
template class AttrConvertPassThrough {
public:
  AttrConvertPassThrough(SourceOp srcOp) : srcAttrs(srcOp->getAttrs()) {}

  ArrayRef getAttrs() const { return srcAttrs; }
-  LLVM::IntegerOverflowFlags getOverflowFlags() const {
-    return LLVM::IntegerOverflowFlags::none;
-  }
+  Attribute getPropAttr() const { return {}; }

private:
  ArrayRef srcAttrs;
@@ -80,10 +81,13 @@ class AttrConvertPassThrough {
/// Basic lowering implementation to rewrite Ops with just one result to the
/// LLVM Dialect. This supports higher-dimensional vector types.
-/// The AttrConvert template template parameter should be a template class
-/// with SourceOp and TargetOp type parameters, a constructor that takes
-/// a SourceOp instance, and a getAttrs() method that returns
-/// ArrayRef.
+/// The AttrConvert template template parameter should:
+/// - be a template class with SourceOp and TargetOp type parameters
+/// - have a constructor that takes a SourceOp instance
+/// - have a getAttrs() method that returns ArrayRef containing
+///   the attributes that the target operation will have
+/// - have a getPropAttr() method that returns either a null attribute or a
+///   DictionaryAttr with properties that exist for the target operation
template typename AttrConvert = AttrConvertPassThrough,
@@ -137,8 +141,8 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern {

    return LLVM::detail::vectorOneToOneRewrite(
        op, TargetOp::getOperationName(), adaptor.getOperands(),
-        attrConvert.getAttrs(), *this->getTypeConverter(), rewriter,
-        attrConvert.getOverflowFlags());
+        attrConvert.getAttrs(), attrConvert.getPropAttr(),
+        *this->getTypeConverter(), rewriter);
  }
};
} // namespace mlir

diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index a38cf41a3e09b..77d780425c3c3 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -158,6 +158,18 @@ class Arith_IntBinaryOpWithOverflowFlags traits = [
    attr-dict `:` type($result) }];
}

+class Arith_IntBinaryOpWithExactFlag traits = []> :
+    Arith_BinaryOp]>,
+    Arguments<(ins SignlessIntegerOrIndexLike:$lhs,
+                   SignlessIntegerOrIndexLike:$rhs,
+                   UnitAttr:$isExact)>,
+    Results<(outs SignlessIntegerOrIndexLike:$result)> {
+
+  let assemblyFormat = [{ $lhs `,` $rhs (`exact` $isExact^)?
+                          attr-dict `:` type($result) }];
+}
+
//===----------------------------------------------------------------------===//
// ConstantOp
//===----------------------------------------------------------------------===//
@@ -482,7 +494,8 @@ def Arith_MulUIExtendedOp : Arith_Op<"mului_extended", [Pure, Commutative,
// DivUIOp
//===----------------------------------------------------------------------===//

-def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
+def Arith_DivUIOp : Arith_IntBinaryOpWithExactFlag<"divui",
+    [ConditionallySpeculatable]> {
  let summary = "unsigned integer division operation";
  let description = [{
    Unsigned integer division. Rounds towards zero. Treats the leading bit as
@@ -493,12 +506,18 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
    `tensor` values, the behavior is undefined if _any_ elements are divided
    by zero.

+    If the `exact` attribute is present, the result value is poison if `lhs` is
+    not a multiple of `rhs`.
+
    Example:

    ```mlir
    // Scalar unsigned integer division.
    %a = arith.divui %b, %c : i64

+    // Scalar unsigned integer division where %b is known to be a multiple of %c.
+    %a = arith.divui %b, %c exact : i64
+
    // SIMD vector element-wise division.
    %f = arith.divui %g, %h : vector<4xi32>

@@ -519,7 +538,8 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
// DivSIOp
//===----------------------------------------------------------------------===//

-def Arith_DivSIOp : Arith_IntBinaryOp<"divsi", [ConditionallySpeculatable]> {
+def Arith_DivSIOp : Arith_IntBinaryOpWithExactFlag<"divsi",
+    [ConditionallySpeculatable]> {
  let summary = "signed integer division operation";
  let description = [{
    Signed integer division. Rounds towards zero.
Treats the leading bit as
@@ -530,12 +550,18 @@ def Arith_DivSIOp : Arith_IntBinaryOpWithExactFlag<"divsi",
    [ConditionallySpeculatable]> {
    behavior is undefined if _any_ of its elements are divided by zero or has a
    signed division overflow.

+    If the `exact` attribute is present, the result value is poison if `lhs` is
+    not a multiple of `rhs`.
+
    Example:

    ```mlir
    // Scalar signed integer division.
    %a = arith.divsi %b, %c : i64

+    // Scalar signed integer division where %b is known to be a multiple of %c.
+    %a = arith.divsi %b, %c exact : i64
+
    // SIMD vector element-wise division.
    %f = arith.divsi %g, %h : vector<4xi32>

@@ -821,7 +847,7 @@ def Arith_ShLIOp : Arith_IntBinaryOpWithOverflowFlags<"shli"> {
// ShRUIOp
//===----------------------------------------------------------------------===//

-def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
+def Arith_ShRUIOp : Arith_IntBinaryOpWithExactFlag<"shrui", [Pure]> {
  let summary = "unsigned integer right-shift";
  let description = [{
    The `shrui` operation shifts an integer value of the first operand to the right
@@ -830,12 +856,17 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
    filled with zeros. If the value of the second operand is greater or equal than
    the bitwidth of the first operand, then the operation returns poison.

+    If the `exact` attribute is present, the result value of shrui is a poison
+    value if any of the bits shifted out are non-zero.
+
    Example:

    ```mlir
-    %1 = arith.constant 160 : i8 // %1 is 0b10100000
+    %1 = arith.constant 160 : i8   // %1 is 0b10100000
    %2 = arith.constant 3 : i8
-    %3 = arith.shrui %1, %2 : (i8, i8) -> i8 // %3 is 0b00010100
+    %3 = arith.constant 6 : i8
+    %4 = arith.shrui %1, %2 exact : i8   // %4 is 0b00010100
+    %5 = arith.shrui %1, %3 : i8   // %5 is 0b00000010
    ```
  }];
  let hasFolder = 1;
@@ -845,7 +876,7 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
// ShRSIOp
//===----------------------------------------------------------------------===//

-def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> {
+def Arith_ShRSIOp : Arith_IntBinaryOpWithExactFlag<"shrsi", [Pure]> {
  let summary = "signed integer right-shift";
  let description = [{
    The `shrsi` operation shifts an integer value of the first operand to the right
@@ -856,14 +887,17 @@ def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> {
    operand is greater or equal than bitwidth of the first operand, then the
    operation returns poison.

+    If the `exact` attribute is present, the result value of shrsi is a poison
+    value if any of the bits shifted out are non-zero.
+ Example: ```mlir - %1 = arith.constant 160 : i8 // %1 is 0b10100000 + %1 = arith.constant 160 : i8 // %1 is 0b10100000 %2 = arith.constant 3 : i8 - %3 = arith.shrsi %1, %2 : (i8, i8) -> i8 // %3 is 0b11110100 - %4 = arith.constant 96 : i8 // %4 is 0b01100000 - %5 = arith.shrsi %4, %2 : (i8, i8) -> i8 // %5 is 0b00001100 + %3 = arith.shrsi %1, %2 exact : i8 // %3 is 0b11110100 + %4 = arith.constant 98 : i8 // %4 is 0b01100010 + %5 = arith.shrsi %4, %2 : i8 // %5 is 0b00001100 ``` }]; let hasFolder = 1; diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index f2bacc3399144..cc3e8468f298b 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -281,6 +281,7 @@ ConstantOpLowering::matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { return LLVM::detail::oneToOneRewrite(op, LLVM::ConstantOp::getOperationName(), adaptor.getOperands(), op->getAttrs(), + /*propAttr=*/Attribute{}, *getTypeConverter(), rewriter); } diff --git a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp index 86d02e6c6209f..6a0c21185983e 100644 --- a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp +++ b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp @@ -96,7 +96,8 @@ struct ConstantOpLowering : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { return LLVM::detail::oneToOneRewrite( op, LLVM::ConstantOp::getOperationName(), adaptor.getOperands(), - op->getAttrs(), *getTypeConverter(), rewriter); + op->getAttrs(), /*propAttr=*/Attribute{}, *getTypeConverter(), + rewriter); } }; diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp index 48a03198fd465..f28a6ccb42455 100644 --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -296,19 +296,13 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors( // Detail methods //===----------------------------------------------------------------------===// -void LLVM::detail::setNativeProperties(Operation *op, - IntegerOverflowFlags overflowFlags) { - if (auto iface = dyn_cast(op)) - iface.setOverflowFlags(overflowFlags); -} - /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. LogicalResult LLVM::detail::oneToOneRewrite( Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags) { + ArrayRef targetAttrs, Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter) { unsigned numResults = op->getNumResults(); SmallVector resultTypes; @@ -320,11 +314,10 @@ LogicalResult LLVM::detail::oneToOneRewrite( } // Create the operation through state since we don't know its C++ type. - Operation *newOp = - rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), operands, - resultTypes, targetAttrs); - - setNativeProperties(newOp, overflowFlags); + OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp), operands, + resultTypes, targetAttrs); + state.propertiesAttr = propertiesAttr; + Operation *newOp = rewriter.create(state); // If the operation produced 0 or 1 result, return them immediately. 
if (numResults == 0) diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp index e7dd0b506e12d..24b01259f0499 100644 --- a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp @@ -105,9 +105,9 @@ LogicalResult LLVM::detail::handleMultidimensionalVectors( LogicalResult LLVM::detail::vectorOneToOneRewrite( Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags) { + ArrayRef targetAttrs, Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter) { assert(!operands.empty()); // Cannot convert ops if their operands are not of LLVM type. @@ -116,15 +116,14 @@ LogicalResult LLVM::detail::vectorOneToOneRewrite( auto llvmNDVectorTy = operands[0].getType(); if (!isa(llvmNDVectorTy)) - return oneToOneRewrite(op, targetOp, operands, targetAttrs, typeConverter, - rewriter, overflowFlags); - - auto callback = [op, targetOp, targetAttrs, overflowFlags, + return oneToOneRewrite(op, targetOp, operands, targetAttrs, propertiesAttr, + typeConverter, rewriter); + auto callback = [op, targetOp, targetAttrs, propertiesAttr, &rewriter](Type llvm1DVectorTy, ValueRange operands) { - Operation *newOp = - rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), - operands, llvm1DVectorTy, targetAttrs); - LLVM::detail::setNativeProperties(newOp, overflowFlags); + OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp), + operands, llvm1DVectorTy, targetAttrs); + state.propertiesAttr = propertiesAttr; + Operation *newOp = rewriter.create(state); return newOp->getResult(0); }; diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td index de3efc9fe3506..e256915933a71 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td +++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td @@ -389,8 +389,8 @@ def TruncIExtUIToExtUI : // trunci(shrsi(x, c)) -> trunci(shrui(x, c)) def TruncIShrSIToTrunciShrUI : Pat<(Arith_TruncIOp:$tr - (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0)), $overflow), - (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0))), $overflow), + (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0), $exact), $overflow), + (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0)), $exact), $overflow), [(TruncationMatchesShiftAmount $x, $tr, $c0)]>; //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index 5f1ec66234df2..6fdc1104d2609 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -738,6 +738,22 @@ func.func @ops_supporting_overflow(%arg0: i64, %arg1: i64) { // ----- +// CHECK-LABEL: @ops_supporting_exact +func.func @ops_supporting_exact(i32, i32) { +^bb0(%arg0: i32, %arg1: i32): +// CHECK: = llvm.ashr exact %arg0, %arg1 : i32 + %0 = arith.shrsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.lshr exact %arg0, %arg1 : i32 + %1 = arith.shrui %arg0, %arg1 exact : i32 +// CHECK: = llvm.sdiv exact %arg0, %arg1 : i32 + %2 = arith.divsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.udiv exact %arg0, %arg1 : i32 + %3 = arith.divui %arg0, %arg1 exact : i32 + 
return +} + +// ----- + // CHECK-LABEL: func @memref_bitcast // CHECK-SAME: (%[[ARG:.*]]: memref) // CHECK: %[[V1:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 2fe0995c9d4df..3ad1530248809 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2958,6 +2958,19 @@ func.func @truncIShrSIToTrunciShrUI(%a: i64) -> i32 { return %hi : i32 } +// CHECK-LABEL: @truncIShrSIExactToTrunciShrUIExact +// CHECK-SAME: (%[[A:.+]]: i64) +// CHECK-NEXT: %[[C32:.+]] = arith.constant 32 : i64 +// CHECK-NEXT: %[[SHR:.+]] = arith.shrui %[[A]], %[[C32]] exact : i64 +// CHECK-NEXT: %[[TRU:.+]] = arith.trunci %[[SHR]] : i64 to i32 +// CHECK-NEXT: return %[[TRU]] : i32 +func.func @truncIShrSIExactToTrunciShrUIExact(%a: i64) -> i32 { + %c32 = arith.constant 32: i64 + %sh = arith.shrsi %a, %c32 exact : i64 + %hi = arith.trunci %sh: i64 to i32 + return %hi : i32 +} + // CHECK-LABEL: @truncIShrSIToTrunciShrUIBadShiftAmt1 // CHECK: arith.shrsi func.func @truncIShrSIToTrunciShrUIBadShiftAmt1(%a: i64) -> i32 { diff --git a/mlir/test/Dialect/Arith/ops.mlir b/mlir/test/Dialect/Arith/ops.mlir index 1e656e84da836..58eadfda17060 100644 --- a/mlir/test/Dialect/Arith/ops.mlir +++ b/mlir/test/Dialect/Arith/ops.mlir @@ -151,6 +151,12 @@ func.func @test_divui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divui_exact +func.func @test_divui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divui_tensor func.func @test_divui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divui %arg0, %arg1 : tensor<8x8xi64> @@ -175,6 +181,12 @@ func.func @test_divsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divsi_exact +func.func @test_divsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divsi_tensor func.func @test_divsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divsi %arg0, %arg1 : tensor<8x8xi64> @@ -391,6 +403,12 @@ func.func @test_shrui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrui_exact +func.func @test_shrui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrui_tensor func.func @test_shrui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrui %arg0, %arg1 : tensor<8x8xi64> @@ -415,6 +433,12 @@ func.func @test_shrsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrsi_exact +func.func @test_shrsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrsi_tensor func.func @test_shrsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrsi %arg0, %arg1 : tensor<8x8xi64> From 9349a10f93308a196499d2c80a222476c78f1065 Mon Sep 17 00:00:00 2001 From: Jeremy Furtek Date: Mon, 17 Nov 2025 11:46:56 -0600 Subject: [PATCH 053/105] Fix side effects for LLVM integer operations (udiv, sdiv) incorrectly marked as Pure (#166648) This MR modifies side effect traits of some integer arithmetic operations in the LLVM dialect. 
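As a motivating sketch (the loop structure and operand names here are
illustrative, not taken from the change), hoisting a division out of a
loop that may execute zero times introduces undefined behavior the
original program did not have:

```mlir
scf.for %i = %lb to %ub step %step {
  // If the loop never runs, the division is never executed; hoisting
  // this llvm.sdiv above the loop makes it execute unconditionally,
  // which is UB when %denom is 0 (or for INT_MIN / -1).
  %q = llvm.sdiv %num, %denom : i32
}
```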
Prior to this MR, the LLVM dialect `sdiv` and `udiv` operations were
marked as `Pure` through `tblgen` inheritance of the
`LLVM_ArithmeticOpBase` class. The `Pure` trait allowed incorrect
hoisting of `sdiv`/`udiv` operations by the `loop-invariant-code-motion`
pass.

This MR modifies the `sdiv` and `udiv` LLVM operations to have traits
and code motion behavior identical to their counterparts in the `arith`
dialect, which were established by the commit/review below.

https://github.com/llvm/llvm-project/commit/ed39825be48805b174d3177f1d8d41ed84784d18
https://reviews.llvm.org/D137814

---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 33 ++--
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 28 ++++
 .../loop-invariant-code-motion.mlir | 145 ++++++++++++++++++
 mlir/unittests/Dialect/LLVMIR/CMakeLists.txt | 1 +
 4 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index e425e16a4b1a6..971710fa3ee13 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -39,7 +39,7 @@ class LLVM_TerminatorOp traits = []> :
class LLVM_ArithmeticOpBase traits = []> :
    LLVM_Op,
+        !listconcat([SameOperandsAndResultType, NoMemoryEffect], traits)>,
    LLVM_Builder<"$res = builder.Create" # instName # "($lhs, $rhs);"> {
  dag commonArgs = (ins LLVM_ScalarOrVectorOf:$lhs,
                    LLVM_ScalarOrVectorOf:$rhs);
@@ -116,7 +116,8 @@ class LLVM_IntArithmeticOpWithDisjointFlag traits = []> :
    LLVM_ArithmeticOpBase], traits)> {
+        !listconcat([DeclareOpInterfaceMethods, Pure],
+                    traits)> {
  dag fmfArg = (
    ins DefaultValuedAttr:$fastmathFlags);
  let arguments = !con(commonArgs, fmfArg);
@@ -149,24 +150,26 @@ class LLVM_UnaryFloatArithmeticOp;
-def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", []>;
+                                                      [Commutative, Pure]>;
+def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", [Pure]>;
def LLVM_MulOp : LLVM_IntArithmeticOpWithOverflowFlag<"mul", "Mul",
-                                                      [Commutative]>;
-def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv">;
-def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv">;
-def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem">;
-def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem">;
-def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And">;
-def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or"> {
+                                                      [Commutative, Pure]>;
+def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv",
+    [DeclareOpInterfaceMethods]>;
+def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv",
+    [DeclareOpInterfaceMethods]>;
+def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem", [Pure]>;
+def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem", [Pure]>;
+def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And", [Pure]>;
+def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or", [Pure]> {
  let hasFolder = 1;
}
-def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor">;
-def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", []> {
+def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor", [Pure]>;
+def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", [Pure]> {
  let hasFolder = 1;
}
-def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr">;
-def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr">;
+def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr", [Pure]>;
+def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr", [Pure]>;

// Base class for compare operations. A compare operation takes two operands
A compare operation takes two operands // of the same type and returns a boolean result. If the operands are diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 1bf4a1c508843..5b819485b1be4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -4223,6 +4223,34 @@ LogicalResult InlineAsmOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// UDivOp +//===----------------------------------------------------------------------===// +Speculation::Speculatability UDivOp::getSpeculatability() { + // X / 0 => UB + Value divisor = getRhs(); + if (matchPattern(divisor, m_IntRangeWithoutZeroU())) + return Speculation::Speculatable; + + return Speculation::NotSpeculatable; +} + +//===----------------------------------------------------------------------===// +// SDivOp +//===----------------------------------------------------------------------===// +Speculation::Speculatability SDivOp::getSpeculatability() { + // This function conservatively assumes that all signed division by -1 are + // not speculatable. + // X / 0 => UB + // INT_MIN / -1 => UB + Value divisor = getRhs(); + if (matchPattern(divisor, m_IntRangeWithoutZeroS()) && + matchPattern(divisor, m_IntRangeWithoutNegOneS())) + return Speculation::Speculatable; + + return Speculation::NotSpeculatable; +} + //===----------------------------------------------------------------------===// // LLVMDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index c1604e226a334..31a4f64dd7de0 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -880,6 +880,18 @@ func.func @no_speculate_divui( return } +func.func @no_speculate_udiv( +// CHECK-LABEL: @no_speculate_udiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_divsi( // CHECK-LABEL: @no_speculate_divsi( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -892,6 +904,18 @@ func.func @no_speculate_divsi( return } +func.func @no_speculate_sdiv( +// CHECK-LABEL: @no_speculate_sdiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_ceildivui( // CHECK-LABEL: @no_speculate_ceildivui( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -928,6 +952,18 @@ func.func @no_speculate_divui_const(%num: i32, %lb: index, %ub: index, %step: in return } +func.func @no_speculate_udiv_const(%num: i32, %lb: index, %ub: index, %step: index) { +// CHECK-LABEL: @no_speculate_udiv_const( + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %c0 : i32 + } + + return +} + func.func @speculate_divui_const( // CHECK-LABEL: @speculate_divui_const( %num: i32, %lb: index, %ub: index, %step: index) { @@ -941,6 +977,19 @@ func.func @speculate_divui_const( return } +func.func @speculate_udiv_const( +// CHECK-LABEL: 
@speculate_udiv_const( + %num: i32, %lb: index, %ub: index, %step: index) { + %c5 = llvm.mlir.constant(5 : i32) : i32 +// CHECK: llvm.udiv +// CHECK: scf.for + scf.for %i = %lb to %ub step %step { + %val = llvm.udiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivui_const(%num: i32, %lb: index, %ub: index, %step: index) { // CHECK-LABEL: @no_speculate_ceildivui_const( %c0 = arith.constant 0 : i32 @@ -979,6 +1028,19 @@ func.func @no_speculate_divsi_const0( return } +func.func @no_speculate_sdiv_const0( +// CHECK-LABEL: @no_speculate_sdiv_const0( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %c0 : i32 + } + + return +} + func.func @no_speculate_divsi_const_minus1( // CHECK-LABEL: @no_speculate_divsi_const_minus1( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -992,6 +1054,19 @@ func.func @no_speculate_divsi_const_minus1( return } +func.func @no_speculate_sdiv_const_minus1( +// CHECK-LABEL: @no_speculate_sdiv_const_minus1( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %cm1 = arith.constant -1 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %cm1 : i32 + } + + return +} + func.func @speculate_divsi_const( // CHECK-LABEL: @speculate_divsi_const( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1005,6 +1080,19 @@ func.func @speculate_divsi_const( return } +func.func @speculate_sdiv_const( +// CHECK-LABEL: @speculate_sdiv_const( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c5 = arith.constant 5 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: llvm.sdiv +// CHECK: scf.for + %val = llvm.sdiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivsi_const0( // CHECK-LABEL: @no_speculate_ceildivsi_const0( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1057,6 +1145,19 @@ func.func @no_speculate_divui_range( return } +func.func @no_speculate_udiv_range( +// CHECK-LABEL: @no_speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i8 + } + + return +} + func.func @no_speculate_divsi_range( // CHECK-LABEL: @no_speculate_divsi_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1072,6 +1173,21 @@ func.func @no_speculate_divsi_range( return } +func.func @no_speculate_sdiv_range( +// CHECK-LABEL: @no_speculate_sdiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = -1: i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = 127 : i8, smin = 0 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK-COUNT-2: llvm.sdiv + %val0 = llvm.sdiv %num, %denom0 : i8 + %val1 = llvm.sdiv %num, %denom1 : i8 + } + + return +} + func.func @no_speculate_ceildivui_range( // CHECK-LABEL: @no_speculate_ceildivui_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1113,6 +1229,19 @@ func.func @speculate_divui_range( return } +func.func @speculate_udiv_range( +// CHECK-LABEL: @speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, 
smin = -128 : i8, umax = 255 : i8, umin = 1 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: llvm.udiv +// CHECK: scf.for + %val = llvm.udiv %num, %denom : i8 + } + + return +} + func.func @speculate_divsi_range( // CHECK-LABEL: @speculate_divsi_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1129,6 +1258,22 @@ func.func @speculate_divsi_range( return } +func.func @speculate_sdiv_range( +// CHECK-LABEL: @speculate_sdiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = 127 : i8, smin = 1 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = -2 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK-COUNT-2: llvm.sdiv +// CHECK: scf.for + %val0 = llvm.sdiv %num, %denom0 : i8 + %val1 = llvm.sdiv %num, %denom1 : i8 + + } + + return +} + func.func @speculate_ceildivui_range( // CHECK-LABEL: @speculate_ceildivui_range( %num: i8, %lb: index, %ub: index, %step: index) { diff --git a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt index 7cc130d02ad74..568126fd342cc 100644 --- a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt @@ -4,4 +4,5 @@ add_mlir_unittest(MLIRLLVMIRTests mlir_target_link_libraries(MLIRLLVMIRTests PRIVATE MLIRLLVMDialect + MLIRInferIntRangeInterface ) From a7579fda53b55ae7d7d064d08e58b1269420095d Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Mon, 17 Nov 2025 12:51:04 -0500 Subject: [PATCH 054/105] [PowerPC][AIX] Remove flag for no semantic interposition (#168109) Remove flag to sepecifcy "no semantic interposition" since this is the default for AIX. --- llvm/cmake/modules/HandleLLVMOptions.cmake | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 22ecf4dcee368..fdd3509f03f59 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -450,13 +450,16 @@ if( LLVM_ENABLE_PIC ) # Enable interprocedural optimizations for non-inline functions which would # otherwise be disabled due to GCC -fPIC's default. # Note: GCC<10.3 has a bug on SystemZ. - # + # Note: Default on AIX is "no semantic interposition". # Note: Clang allows IPO for -fPIC so this optimization is less effective. # Clang 13 has a bug related to -fsanitize-coverage # -fno-semantic-interposition (https://reviews.llvm.org/D117183). - if ((CMAKE_COMPILER_IS_GNUCXX AND - NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3)) - OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14)) + if ((NOT ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX")) + AND ((CMAKE_COMPILER_IS_GNUCXX AND + NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3)) + OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" + AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14))) add_flag_if_supported("-fno-semantic-interposition" FNO_SEMANTIC_INTERPOSITION) endif() endif() From 72059bebb3a9427dc70723a37e4c38adfa44553a Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 09:53:16 -0800 Subject: [PATCH 055/105] [compiler-rt][Profile] Mark Darwin test work with internal shell This test was using subshells and then passing the results to diff. Write out the results to files before passing to diff as the internal shell does not support subshells. 
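As a sketch of the rewrite (paths abbreviated with `...`), each RUN line
that relied on process substitution becomes explicit temporary files:

```
# Before: needs the external shell's <(...) process substitution.
# RUN: diff <(llvm-profdata show ... %t.a.profdata) <(llvm-profdata show ... %t.b.profdata)

# After: portable to lit's internal shell.
# RUN: llvm-profdata show ... %t.a.profdata > %t.a.functions
# RUN: llvm-profdata show ... %t.b.profdata > %t.b.functions
# RUN: diff %t.a.functions %t.b.functions
```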
---
 .../Darwin/instrprof-debug-info-correlate.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
index 46d25a4e386dc..1e9bd11d3f49c 100644
--- a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
+++ b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
@@ -7,7 +7,9 @@
// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal
// RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions
+// RUN: diff %t.normal.functions %t.functions

// RUN: %clang_pgogen -o %t.cov -g -mllvm --profile-correlate=debug-info -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp
// RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov
@@ -17,7 +19,9 @@
// RUN: env LLVM_PROFILE_FILE=%t.cov.profraw %run %t.cov.normal
// RUN: llvm-profdata merge -o %t.cov.normal.profdata %t.cov.profraw
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions
+// RUN: diff %t.cov.normal.functions %t.cov.functions

// Test debug info correlate with online merging.

@@ -30,11 +34,15 @@
// RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.proflite %run %t
// RUN: llvm-profdata merge -o %t.profdata --debug-info=%t.dSYM %t.profdir/
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions
+// RUN: diff %t.normal.functions %t.functions

// RUN: rm -rf %t.profdir && mkdir %t.profdir
// RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov
// RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov
// RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov.dSYM %t.profdir/
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions
+// RUN: diff %t.cov.normal.functions %t.cov.functions

From 05bd742ad790f207f5c94c4bf327d3e87b8819dc Mon Sep 17 00:00:00 2001
From: Andres-Salamanca
Date: Mon, 17 Nov 2025 13:01:04 -0500
Subject: [PATCH 056/105] [CIR] Upstream the initial BlockAddressOp implementation (#168151)

This PR adds initial support for codegen of `BlockAddressOp`, which is
emitted when using the GNU "labels as values" extension. The operation
is used together with `IndirectBrOp`, which will be implemented in a
future PR. Lowering will be added in a later PR.
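For reference, a minimal C sketch of the construct being modeled
(mirroring the new label-values.c test; the function and label names are
placeholders):

```c
void f(void) {
  void *ptr = &&done; // GNU extension: take the address of a local label
done:
  return;
}
```

CIRGen turns the `&&done` into a `cir.block_address <@f, "done">`
producing a void-pointer result, paired with a `cir.label "done"` in the
block being addressed.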
---
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td | 25 ++++++
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 34 +++++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 9 +++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 29 ++++++-
 .../lib/CIR/Dialect/Transforms/GotoSolver.cpp | 17 ++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 6 ++
 clang/test/CIR/CodeGen/label-values.c | 76 +++++++++++++++++++
 clang/test/CIR/IR/block-adress.cir | 34 +++++++++
 clang/test/CIR/IR/invalid-block-address.cir | 21 +++++
 clang/test/CIR/Transforms/goto_solver.cir | 62 +++++++++++++++
 clang/tools/cir-opt/cir-opt.cpp | 4 +
 11 files changed, 312 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/label-values.c
 create mode 100644 clang/test/CIR/IR/block-adress.cir
 create mode 100644 clang/test/CIR/IR/invalid-block-address.cir
 create mode 100644 clang/test/CIR/Transforms/goto_solver.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 1e0fb038b19d8..47ff9389e8028 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -1026,4 +1026,29 @@ def CIR_UnwindAttr : CIR_UnitAttr<"Unwind", "unwind"> {
  let storageType = [{ CatchUnwind }];
}

+//===----------------------------------------------------------------------===//
+// CIR_BlockAddrInfoAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddrInfoAttr : CIR_Attr<"BlockAddrInfo", "block_addr_info"> {
+  let summary = "Block address attribute";
+  let description = [{
+    This attribute is used to represent the address of a basic block
+    within a function. It combines the symbol reference to a function
+    with the name of a label inside that function.
+  }];
+  let parameters = (ins "mlir::FlatSymbolRefAttr":$func,
+                        "mlir::StringAttr":$label);
+
+  let assemblyFormat = "`<` $func `,` $label `>`";
+  let builders = [
+    AttrBuilder<(ins "llvm::StringRef":$func_name,
+                     "llvm::StringRef":$label_name
+    ), [{
+      return $_get($_ctxt, mlir::FlatSymbolRefAttr::get($_ctxt, func_name),
+                   mlir::StringAttr::get($_ctxt, label_name));
+    }]>
+  ];
+}
+
#endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 7b987ea49bf97..e612d6a0ba886 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -4897,4 +4897,38 @@ def CIR_AtomicClearOp : CIR_Op<"atomic.clear"> {
  }];
}

+//===----------------------------------------------------------------------===//
+// BlockAddressOp
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddressOp : CIR_Op<"block_address", [Pure]> {
+  let summary = "Get the address of a cir.label within a function";
+  let description = [{
+    The `cir.block_address` operation takes a function name and a label and
+    produces a pointer value that represents the address of that cir.label
+    within the specified function.
+
+    This operation models GCC's "labels as values" extension (`&&label`), which
+    allows taking the address of a local label and using it as a computed
+    jump target (e.g., with `goto *addr;`).
+ + Example: + ```mlir + %1 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] + {alignment = 8 : i64} + %addr = cir.block_address <@c, "label1"> : !cir.ptr + cir.store align(8) %addr, %1 : !cir.ptr, !cir.ptr> + cir.br ^bb1 + ^bb1: + cir.label "label" + ``` + }]; + + let arguments = (ins CIR_BlockAddrInfoAttr:$block_addr_info); + let results = (outs CIR_VoidPtrType:$addr); + let assemblyFormat = [{ + $block_addr_info `:` qualified(type($addr)) attr-dict + }]; +} + #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 3b0977d213325..f777562ba6309 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -168,6 +168,15 @@ class ScalarExprEmitter : public StmtVisitor { return emitLoadOfLValue(e); } + mlir::Value VisitAddrLabelExpr(const AddrLabelExpr *e) { + auto func = cast(cgf.curFn); + auto blockInfoAttr = cir::BlockAddrInfoAttr::get( + &cgf.getMLIRContext(), func.getSymName(), e->getLabel()->getName()); + return cir::BlockAddressOp::create(builder, cgf.getLoc(e->getSourceRange()), + cgf.convertType(e->getType()), + blockInfoAttr); + } + mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 9ac5efe0e41c7..22aada882defc 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1912,22 +1912,45 @@ mlir::LogicalResult cir::FuncOp::verify() { llvm::SmallSet labels; llvm::SmallSet gotos; - + llvm::SmallSet blockAddresses; + bool invalidBlockAddress = false; getOperation()->walk([&](mlir::Operation *op) { if (auto lab = dyn_cast(op)) { labels.insert(lab.getLabel()); } else if (auto goTo = dyn_cast(op)) { gotos.insert(goTo.getLabel()); + } else if (auto blkAdd = dyn_cast(op)) { + if (blkAdd.getBlockAddrInfoAttr().getFunc().getAttr() != getSymName()) { + // Stop the walk early, no need to continue + invalidBlockAddress = true; + return mlir::WalkResult::interrupt(); + } + blockAddresses.insert(blkAdd.getBlockAddrInfoAttr().getLabel()); } + return mlir::WalkResult::advance(); }); + if (invalidBlockAddress) + return emitOpError() << "blockaddress references a different function"; + + llvm::SmallSet mismatched; if (!labels.empty() || !gotos.empty()) { - llvm::SmallSet mismatched = - llvm::set_difference(gotos, labels); + mismatched = llvm::set_difference(gotos, labels); if (!mismatched.empty()) return emitOpError() << "goto/label mismatch"; } + + mismatched.clear(); + + if (!labels.empty() || !blockAddresses.empty()) { + mismatched = llvm::set_difference(blockAddresses, labels); + + if (!mismatched.empty()) + return emitOpError() + << "expects an existing label target in the referenced function"; + } + return success(); } diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp index 00972b6976295..d590ccce1f540 100644 --- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp +++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp @@ -8,6 +8,7 @@ #include "PassDetail.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" #include "clang/CIR/Dialect/Passes.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/TimeProfiler.h" #include @@ -30,17 +31,29 @@ static void process(cir::FuncOp func) { mlir::OpBuilder rewriter(func.getContext()); llvm::StringMap 
labels; llvm::SmallVector gotos; + llvm::SmallSet blockAddrLabel; func.getBody().walk([&](mlir::Operation *op) { if (auto lab = dyn_cast(op)) { - // Will construct a string copy inplace. Safely erase the label labels.try_emplace(lab.getLabel(), lab->getBlock()); - lab.erase(); } else if (auto goTo = dyn_cast(op)) { gotos.push_back(goTo); + } else if (auto blockAddr = dyn_cast(op)) { + blockAddrLabel.insert(blockAddr.getBlockAddrInfo().getLabel()); } }); + for (auto &lab : labels) { + StringRef labelName = lab.getKey(); + Block *block = lab.getValue(); + if (!blockAddrLabel.contains(labelName)) { + // erase the LabelOp inside the block if safe + if (auto lab = dyn_cast(&block->front())) { + lab.erase(); + } + } + } + for (auto goTo : gotos) { mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(goTo); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 92434d730eb31..d43a462a25092 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -3837,6 +3837,12 @@ mlir::LogicalResult CIRToLLVMVAArgOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMBlockAddressOpLowering::matchAndRewrite( + cir::BlockAddressOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + return mlir::failure(); +} + std::unique_ptr createConvertCIRToLLVMPass() { return std::make_unique(); } diff --git a/clang/test/CIR/CodeGen/label-values.c b/clang/test/CIR/CodeGen/label-values.c new file mode 100644 index 0000000000000..41178e3f62f20 --- /dev/null +++ b/clang/test/CIR/CodeGen/label-values.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR + +void A(void) { + void *ptr = &&LABEL_A; +LABEL_A: + return; +} +// CIR: cir.func dso_local @A +// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} +// CIR: [[BLOCK:%.*]] = cir.block_address <@A, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: cir.return + +void B(void) { +LABEL_B: + void *ptr = &&LABEL_B; +} + +// CIR: cir.func dso_local @B() +// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: cir.label "LABEL_B" +// CIR: [[BLOCK:%.*]] = cir.block_address <@B, "LABEL_B"> : !cir.ptr +// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr> +// CIR: cir.return + +void C(int x) { + void *ptr = (x == 0) ? 
&&LABEL_A : &&LABEL_B; +LABEL_A: + return; +LABEL_B: + return; +} + +// CIR: cir.func dso_local @C +// CIR: [[BLOCK1:%.*]] = cir.block_address <@C, "LABEL_A"> : !cir.ptr +// CIR: [[BLOCK2:%.*]] = cir.block_address <@C, "LABEL_B"> : !cir.ptr +// CIR: [[COND:%.*]] = cir.select if [[CMP:%.*]] then [[BLOCK1]] else [[BLOCK2]] : (!cir.bool, !cir.ptr, !cir.ptr) -> !cir.ptr +// CIR: cir.store align(8) [[COND]], [[PTR:%.*]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: cir.br ^bb2 +// CIR: ^bb2: // 2 preds: ^bb1, ^bb3 +// CIR: cir.return +// CIR: ^bb3: // no predecessors +// CIR: cir.label "LABEL_B" +// CIR: cir.br ^bb2 + +void D(void) { + void *ptr = &&LABEL_A; + void *ptr2 = &&LABEL_A; +LABEL_A: + void *ptr3 = &&LABEL_A; + return; +} + +// CIR: cir.func dso_local @D +// CIR: %[[PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] +// CIR: %[[PTR2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr2", init] +// CIR: %[[PTR3:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr3", init] +// CIR: %[[BLK1:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK1]], %[[PTR]] : !cir.ptr, !cir.ptr> +// CIR: %[[BLK2:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK2]], %[[PTR2]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: %[[BLK3:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK3]], %[[PTR3]] : !cir.ptr, !cir.ptr> +// CIR: cir.return diff --git a/clang/test/CIR/IR/block-adress.cir b/clang/test/CIR/IR/block-adress.cir new file mode 100644 index 0000000000000..9d6840819c2d4 --- /dev/null +++ b/clang/test/CIR/IR/block-adress.cir @@ -0,0 +1,34 @@ +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s + +!void = !cir.void + +module { + cir.func @block_address(){ + %0 = cir.block_address <@block_address, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "label" + cir.return + } +// CHECK: cir.func @block_address +// CHECK: %0 = cir.block_address <@block_address, "label"> : !cir.ptr +// CHECK: cir.br ^bb1 +// CHECK: ^bb1: +// CHECK: cir.label "label" +// CHECK: cir.return + +cir.func @block_address_inside_scope() -> () { + cir.scope{ + %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr + } + cir.br ^bb1 +^bb1: + cir.label "label" + cir.return +} +// CHECK: cir.func @block_address_inside_scope +// CHECK: cir.scope +// CHECK: %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr +// CHECK: cir.label "label" +// CHECK: cir.return +} diff --git a/clang/test/CIR/IR/invalid-block-address.cir b/clang/test/CIR/IR/invalid-block-address.cir new file mode 100644 index 0000000000000..4519485c28803 --- /dev/null +++ b/clang/test/CIR/IR/invalid-block-address.cir @@ -0,0 +1,21 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +!void = !cir.void + +// expected-error@+1 {{expects an existing label target in the referenced function}} +cir.func @bad_block_address() -> () { + %0 = cir.block_address <@bad_block_address, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "wrong_label" + cir.return +} + +// expected-error@+1 {{blockaddress references a different function}} +cir.func @bad_block_func() -> () { + %0 = cir.block_address <@mismatch_func, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "label" + cir.return +} diff --git a/clang/test/CIR/Transforms/goto_solver.cir b/clang/test/CIR/Transforms/goto_solver.cir new file mode 100644 index 
0000000000000..6ae019b44a39e --- /dev/null +++ b/clang/test/CIR/Transforms/goto_solver.cir @@ -0,0 +1,62 @@ +// RUN: cir-opt %s -cir-goto-solver --verify-roundtrip -o - | FileCheck %s + +!void = !cir.void + +cir.func @a(){ + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@a, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.br ^bb1 +^bb1: + cir.label "label1" + cir.br ^bb2 +^bb2: + // This label is not referenced by any blockaddressOp, so it should be removed + cir.label "label2" + cir.return +} + +// CHECK: cir.func @a() +// CHECK: %1 = cir.block_address <@a, "label1"> : !cir.ptr +// CHECK: ^bb1: +// CHECK: cir.label "label1" +// CHECK: cir.br ^bb2 +// CHECK: ^bb2: +// CHECK-NOT: cir.label "label2" + +cir.func @b(){ + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@b, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.goto "label2" +^bb1: + cir.label "label1" + cir.br ^bb2 +^bb2: + // This label is not referenced by any blockaddressOp, so it should be removed + cir.label "label2" + cir.return +} + +// CHECK: cir.func @b() { +// CHECK: %1 = cir.block_address <@b, "label1"> : !cir.ptr +// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr> +// CHECK: cir.br ^bb2 +// CHECK: ^bb1: +// CHECK: cir.label "label1" +// CHECK: cir.br ^bb2 +// CHECK: ^bb2: +// CHECK-NOT: cir.label "label2" + +cir.func @c() { + cir.label "label1" + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@c, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.return +} + +// CHECK: cir.func @c +// CHECK: cir.label "label1" +// CHECK: %1 = cir.block_address <@c, "label1"> : !cir.ptr +// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr> diff --git a/clang/tools/cir-opt/cir-opt.cpp b/clang/tools/cir-opt/cir-opt.cpp index c4d29a2117c75..ee42015bb38e9 100644 --- a/clang/tools/cir-opt/cir-opt.cpp +++ b/clang/tools/cir-opt/cir-opt.cpp @@ -58,6 +58,10 @@ int main(int argc, char **argv) { return mlir::createHoistAllocasPass(); }); + ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> { + return mlir::createGotoSolverPass(); + }); + mlir::registerTransformsPasses(); return mlir::asMainReturnCode(MlirOptMain( From 1425d75c7116c33b084f49eda1c12b299b342315 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 10:06:45 -0800 Subject: [PATCH 057/105] [X86] Delete Profile Guided Prefetch Passes (#167317) As the PGPF effort has been turned down, there is no current way to generate profiles that will be used by these passes. Current efforts are also focused around inserting prefetches in PLO optimizers, which have a more accurate view of how the code looks. 
--- llvm/lib/Target/X86/CMakeLists.txt | 2 - llvm/lib/Target/X86/X86.h | 7 - llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 184 ------------- llvm/lib/Target/X86/X86InsertPrefetch.cpp | 259 ------------------ llvm/lib/Target/X86/X86TargetMachine.cpp | 2 - llvm/test/CodeGen/X86/O0-pipeline.ll | 2 - .../X86/discriminate-mem-ops-missing-info.ll | 55 ---- .../X86/discriminate-mem-ops-skip-pfetch.ll | 68 ----- llvm/test/CodeGen/X86/discriminate-mem-ops.ll | 55 ---- .../CodeGen/X86/insert-prefetch-inline.afdo | 4 - .../CodeGen/X86/insert-prefetch-inline.ll | 76 ----- .../X86/insert-prefetch-invalid-instr.afdo | 2 - .../X86/insert-prefetch-invalid-instr.ll | 41 --- .../CodeGen/X86/insert-prefetch-other.afdo | 3 - llvm/test/CodeGen/X86/insert-prefetch.afdo | 3 - llvm/test/CodeGen/X86/insert-prefetch.ll | 101 ------- llvm/test/CodeGen/X86/opt-pipeline.ll | 2 - 17 files changed, 866 deletions(-) delete mode 100644 llvm/lib/Target/X86/X86DiscriminateMemOps.cpp delete mode 100644 llvm/lib/Target/X86/X86InsertPrefetch.cpp delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-inline.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-inline.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-other.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch.ll diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index f9bd233cf8ecf..434a6d2c3553f 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -31,7 +31,6 @@ set(sources X86CmovConversion.cpp X86CodeGenPassBuilder.cpp X86DomainReassignment.cpp - X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp X86LowerAMXIntrinsics.cpp @@ -57,7 +56,6 @@ set(sources X86IndirectBranchTracking.cpp X86IndirectThunks.cpp X86InterleavedAccess.cpp - X86InsertPrefetch.cpp X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 03706aaaab237..97848bec7127e 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -166,13 +166,6 @@ FunctionPass *createX86IndirectThunksPass(); /// This pass replaces ret instructions with jmp's to __x86_return thunk. FunctionPass *createX86ReturnThunksPass(); -/// This pass ensures instructions featuring a memory operand -/// have distinctive (with respect to each other) -FunctionPass *createX86DiscriminateMemOpsPass(); - -/// This pass applies profiling information to insert cache prefetches. -FunctionPass *createX86InsertPrefetchPass(); - /// This pass insert wait instruction after X87 instructions which could raise /// fp exceptions when strict-fp enabled. 
FunctionPass *createX86InsertX87waitPass(); diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp deleted file mode 100644 index bd151a450394a..0000000000000 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// This pass aids profile-driven cache prefetch insertion by ensuring all -/// instructions that have a memory operand are distinguishible from each other. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/Debug.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "x86-discriminate-memops" - -static cl::opt EnableDiscriminateMemops( - DEBUG_TYPE, cl::init(false), - cl::desc("Generate unique debug info for each instruction with a memory " - "operand. Should be enabled for profile-driven cache prefetching, " - "both in the build of the binary being profiled, as well as in " - "the build of the binary consuming the profile."), - cl::Hidden); - -static cl::opt BypassPrefetchInstructions( - "x86-bypass-prefetch-instructions", cl::init(true), - cl::desc("When discriminating instructions with memory operands, ignore " - "prefetch instructions. This ensures the other memory operand " - "instructions have the same identifiers after inserting " - "prefetches, allowing for successive insertions."), - cl::Hidden); - -namespace { - -using Location = std::pair; - -Location diToLocation(const DILocation *Loc) { - return std::make_pair(Loc->getFilename(), Loc->getLine()); -} - -/// Ensure each instruction having a memory operand has a distinct pair. -void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) { - DebugLoc DL(Loc); - MI->setDebugLoc(DL); -} - -class X86DiscriminateMemOps : public MachineFunctionPass { - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { - return "X86 Discriminate Memory Operands"; - } - -public: - static char ID; - - /// Default construct and initialize the pass. - X86DiscriminateMemOps(); -}; - -bool IsPrefetchOpcode(unsigned Opcode) { - return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || - Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2 || - Opcode == X86::PREFETCHIT0 || Opcode == X86::PREFETCHIT1 || - Opcode == X86::PREFETCHRST2; -} -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86DiscriminateMemOps::ID = 0; - -/// Default construct and initialize the pass. 
-X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} - -bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { - if (!EnableDiscriminateMemops) - return false; - - DISubprogram *FDI = MF.getFunction().getSubprogram(); - if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) - return false; - - // Have a default DILocation, if we find instructions with memops that don't - // have any debug info. - const DILocation *ReferenceDI = - DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - assert(ReferenceDI && "ReferenceDI should not be nullptr"); - DenseMap MemOpDiscriminators; - MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; - - // Figure out the largest discriminator issued for each Location. When we - // issue new discriminators, we can thus avoid issuing discriminators - // belonging to instructions that don't have memops. This isn't a requirement - // for the goals of this pass, however, it avoids unnecessary ambiguity. - for (auto &MBB : MF) { - for (auto &MI : MBB) { - const auto &DI = MI.getDebugLoc(); - if (!DI) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - Location Loc = diToLocation(DI); - unsigned &Disc = MemOpDiscriminators[Loc]; - Disc = std::max(Disc, DI->getBaseDiscriminator()); - } - } - - // Keep track of the discriminators seen at each Location. If an instruction's - // DebugInfo has a Location and discriminator we've already seen, replace its - // discriminator with a new one, to guarantee uniqueness. - DenseMap> Seen; - - bool Changed = false; - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - const DILocation *DI = MI.getDebugLoc(); - bool HasDebug = DI; - if (!HasDebug) { - DI = ReferenceDI; - } - Location L = diToLocation(DI); - DenseSet &Set = Seen[L]; - const std::pair::iterator, bool> TryInsert = - Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second || !HasDebug) { - unsigned BF, DF, CI = 0; - DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); - std::optional EncodedDiscriminator = - DILocation::encodeDiscriminator(MemOpDiscriminators[L] + 1, DF, CI); - - if (!EncodedDiscriminator) { - // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK - // not to support. If evidence points otherwise, we can explore synthesizeing - // unique DIs by adding fake line numbers, or by constructing 64 bit - // discriminators. - LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator " - "for instruction with memory operand in: " - << DI->getFilename() << " Line: " << DI->getLine() - << " Column: " << DI->getColumn() - << ". This is likely due to a large macro expansion. \n"); - continue; - } - // Since we were able to encode, bump the MemOpDiscriminators. - ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); - assert(DI && "DI should not be nullptr"); - updateDebugInfo(&MI, DI); - Changed = true; - std::pair::iterator, bool> MustInsert = - Set.insert(DI->getBaseDiscriminator()); - (void)MustInsert; // Silence warning in release build. - assert(MustInsert.second && "New discriminator shouldn't be present in set"); - } - - // Bump the reference DI to avoid cramming discriminators on line 0. - // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI - // in a block. 
It's more consistent than just relying on the last memop - // instruction we happened to see. - ReferenceDI = DI; - } - } - return Changed; -} - -FunctionPass *llvm::createX86DiscriminateMemOpsPass() { - return new X86DiscriminateMemOps(); -} diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp deleted file mode 100644 index 953b755a0ca4c..0000000000000 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ /dev/null @@ -1,259 +0,0 @@ -//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass applies cache prefetch instructions based on a profile. The pass -// assumes DiscriminateMemOps ran immediately before, to ensure debug info -// matches the one used at profile generation time. The profile is encoded in -// afdo format (text or binary). It contains prefetch hints recommendations. -// Each recommendation is made in terms of debug info locations, a type (i.e. -// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a -// memory operand (see X86DiscriminateMemOps). The prefetch will be made for -// a location at that memory operand + the delta specified in the -// recommendation. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Module.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/VirtualFileSystem.h" -#include "llvm/Transforms/IPO/SampleProfile.h" -using namespace llvm; -using namespace sampleprof; - -static cl::opt - PrefetchHintsFile("prefetch-hints-file", - cl::desc("Path to the prefetch hints profile. See also " - "-x86-discriminate-memops"), - cl::Hidden); -namespace { - -class X86InsertPrefetch : public MachineFunctionPass { - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool doInitialization(Module &) override; - - bool runOnMachineFunction(MachineFunction &MF) override; - struct PrefetchInfo { - unsigned InstructionID; - int64_t Delta; - }; - typedef SmallVectorImpl Prefetches; - bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI, - Prefetches &prefetches) const; - -public: - static char ID; - X86InsertPrefetch(const std::string &PrefetchHintsFilename); - StringRef getPassName() const override { - return "X86 Insert Cache Prefetches"; - } - -private: - std::string Filename; - std::unique_ptr Reader; -}; - -using PrefetchHints = SampleRecord::CallTargetMap; - -// Return any prefetching hints for the specified MachineInstruction. The hints -// are returned as pairs (name, delta). -ErrorOr -getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) { - if (const auto &Loc = MI.getDebugLoc()) - if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) - return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), - Loc->getBaseDiscriminator()); - return std::error_code(); -} - -// The prefetch instruction can't take memory operands involving vector -// registers. 
-bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); - return (BaseReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && - (IndexReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)); -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86InsertPrefetch::ID = 0; - -X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename) - : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {} - -/// Return true if the provided MachineInstruction has cache prefetch hints. In -/// that case, the prefetch hints are stored, in order, in the Prefetches -/// vector. -bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, - const MachineInstr &MI, - Prefetches &Prefetches) const { - assert(Prefetches.empty() && - "Expected caller passed empty PrefetchInfo vector."); - - // There is no point to match prefetch hints if the profile is using MD5. - if (FunctionSamples::UseMD5) - return false; - - static constexpr std::pair HintTypes[] = { - {"_nta_", X86::PREFETCHNTA}, - {"_t0_", X86::PREFETCHT0}, - {"_t1_", X86::PREFETCHT1}, - {"_t2_", X86::PREFETCHT2}, - }; - static const char *SerializedPrefetchPrefix = "__prefetch"; - - auto T = getPrefetchHints(TopSamples, MI); - if (!T) - return false; - int16_t max_index = -1; - // Convert serialized prefetch hints into PrefetchInfo objects, and populate - // the Prefetches vector. - for (const auto &S_V : *T) { - StringRef Name = S_V.first.stringRef(); - if (Name.consume_front(SerializedPrefetchPrefix)) { - int64_t D = static_cast(S_V.second); - unsigned IID = 0; - for (const auto &HintType : HintTypes) { - if (Name.consume_front(HintType.first)) { - IID = HintType.second; - break; - } - } - if (IID == 0) - return false; - uint8_t index = 0; - Name.consumeInteger(10, index); - - if (index >= Prefetches.size()) - Prefetches.resize(index + 1); - Prefetches[index] = {IID, D}; - max_index = std::max(max_index, static_cast(index)); - } - } - assert(max_index + 1 >= 0 && - "Possible overflow: max_index + 1 should be positive."); - assert(static_cast(max_index + 1) == Prefetches.size() && - "The number of prefetch hints received should match the number of " - "PrefetchInfo objects returned"); - return !Prefetches.empty(); -} - -bool X86InsertPrefetch::doInitialization(Module &M) { - if (Filename.empty()) - return false; - - LLVMContext &Ctx = M.getContext(); - // TODO: Propagate virtual file system into LLVM targets. 
- auto FS = vfs::getRealFileSystem(); - ErrorOr> ReaderOrErr = - SampleProfileReader::create(Filename, Ctx, *FS); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg, - DiagnosticSeverity::DS_Warning)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - Reader->read(); - return true; -} - -void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { - if (!Reader) - return false; - const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction()); - if (!Samples) - return false; - - bool Changed = false; - - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - SmallVector Prefetches; - for (auto &MBB : MF) { - for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) { - auto Current = MI; - ++MI; - - int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags); - if (Offset < 0) - continue; - unsigned Bias = X86II::getOperandBias(Current->getDesc()); - int MemOpOffset = Offset + Bias; - // FIXME(mtrofin): ORE message when the recommendation cannot be taken. - if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset)) - continue; - Prefetches.clear(); - if (!findPrefetchInfo(Samples, *Current, Prefetches)) - continue; - assert(!Prefetches.empty() && - "The Prefetches vector should contain at least a value if " - "findPrefetchInfo returned true."); - for (auto &PrefInfo : Prefetches) { - unsigned PFetchInstrID = PrefInfo.InstructionID; - int64_t Delta = PrefInfo.Delta; - const MCInstrDesc &Desc = TII->get(PFetchInstrID); - MachineInstr *PFetch = - MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); - MachineInstrBuilder MIB(MF, PFetch); - - static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && - X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && - X86::AddrSegmentReg == 4, - "Unexpected change in X86 operand offset order."); - - // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. - // FIXME(mtrofin): consider adding a: - // MachineInstrBuilder::set(unsigned offset, op). - MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg()) - .addImm( - Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm()) - .addReg( - Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg()) - .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() + - Delta) - .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg) - .getReg()); - - if (!Current->memoperands_empty()) { - MachineMemOperand *CurrentOp = *(Current->memoperands_begin()); - MIB.addMemOperand(MF.getMachineMemOperand( - CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize())); - } - - // Insert before Current. This is because Current may clobber some of - // the registers used to describe the input memory operand. 
- MBB.insert(Current, PFetch); - Changed = true; - } - } - } - return Changed; -} - -FunctionPass *llvm::createX86InsertPrefetchPass() { - return new X86InsertPrefetch(PrefetchHintsFile); -} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 543220b2fd3b9..713df63479987 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -563,8 +563,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupVectorConstants()); } addPass(createX86CompressEVEXPass()); - addPass(createX86DiscriminateMemOpsPass()); - addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); } diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 0fbfb42d2a4dd..78a02b11b17bb 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -68,8 +68,6 @@ ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll deleted file mode 100644 index 6bbf3eb307da3..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %pos2 to i64 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1 - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 2, column: 20, scope: !7) - - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: .loc 1 1 0 {{.*}} 
discriminator 2 -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll deleted file mode 100644 index ca412c590b2e3..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; RUN: llc -x86-discriminate-memops -x86-bypass-prefetch-instructions=0 < %s | FileCheck %s -check-prefix=NOBYPASS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - call void @llvm.prefetch(ptr %arrayidx2, i32 0, i32 3, i32 1) - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetcht0 (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 - -;NOBYPASS-LABEL: sum: -;NOBYPASS: # %bb.0: -;NOBYPASS: prefetcht0 (%rdi,%rax,4) -;NOBYPASS-NEXT: .loc 1 2 22 -;NOBYPASS-NEXT: movl (%rdi,%rax,4), %eax -;NOBYPASS-NEXT: .loc 1 2 20 {{.*}} discriminator 2 # test.cc:2:20 -;NOBYPASS-NEXT: addl (%rdi,%rcx,4), %eax -;NOBYPASS-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops.ll deleted file mode 100644 index a8421d9506a87..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; 
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo deleted file mode 100644 index 935b707ff1072..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ /dev/null @@ -1,4 +0,0 @@ -caller:0:0 - 2: sum:0 - 3: 0 __prefetch_nta_0:23456 - 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll b/llvm/test/CodeGen/X86/insert-prefetch-inline.ll deleted file mode 100644 index 05f542799c08b..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s -; -; Verify we can insert prefetch instructions in code belonging to inlined -; functions. 
-; -; ModuleID = 'test.cc' - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @sum(ptr nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !10 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !10 - %0 = load i32, ptr %arrayidx, align 4, !dbg !10, !tbaa !11 - %idxprom1 = sext i32 %pos2 to i64, !dbg !15 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !15 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !15, !tbaa !11 - %add = add nsw i32 %1, %0, !dbg !16 - ret i32 %add, !dbg !17 -} - -; "caller" inlines "sum". The associated .afdo file references instructions -; in "caller" that came from "sum"'s inlining. -; -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @caller(ptr nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 { -entry: - %0 = load i32, ptr %arr, align 4, !dbg !19, !tbaa !11 - %arrayidx2.i = getelementptr inbounds i32, ptr %arr, i64 2, !dbg !21 - %1 = load i32, ptr %arrayidx2.i, align 4, !dbg !21, !tbaa !11 - %add.i = add nsw i32 %1, %0, !dbg !22 - ret i32 %add.i, !dbg !23 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DIFile(filename: "./test.h", directory: "/tmp") -!9 = !DISubroutineType(types: !2) -!10 = !DILocation(line: 6, column: 10, scope: !7) -!11 = !{!12, !12, i64 0} -!12 = !{!"int", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 6, column: 22, scope: !7) -!16 = !DILocation(line: 6, column: 20, scope: !7) -!17 = !DILocation(line: 6, column: 3, scope: !7) -!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20) -!20 = distinct !DILocation(line: 6, column: 10, scope: !18) -!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20) -!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20) -!23 = !DILocation(line: 6, column: 3, scope: !18) - -; CHECK-LABEL: caller: -; CHECK-LABEL: # %bb.0: -; CHECK-NEXT: .loc 1 6 22 prologue_end -; CHECK-NEXT: prefetchnta 23464(%rdi) -; CHECK-NEXT: movl 8(%rdi), %eax -; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2 -; CHECK-NEXT: prefetchnta 8764(%rdi) -; CHECK-NEXT: prefetchnta 64(%rdi) -; CHECK-NEXT: addl (%rdi), %eax diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo deleted file mode 100644 index 
6385a498b8f92..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo +++ /dev/null @@ -1,2 +0,0 @@ -main:0:0 - 6: 0 __prefetch_nta_0:42 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll deleted file mode 100644 index f8e25028cfdee..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s -; ModuleID = 'prefetch.cc' -source_filename = "prefetch.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9 - ret i32 291, !dbg !11 -} - -; Function Attrs: inaccessiblemem_or_argmemonly nounwind -declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1 - -attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"} -attributes #1 = { inaccessiblemem_or_argmemonly nounwind } -attributes #2 = { argmemonly nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 12, column: 3, scope: !7) -!10 = !DILocation(line: 14, column: 3, scope: !7) -!11 = !DILocation(line: 15, column: 3, scope: !7) - -;CHECK-LABEL: main: -;CHECK: # %bb.0: -;CHECK: prefetchnta 291 -;CHECK-NOT: prefetchnta 42(%rax,%ymm0) diff --git a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo b/llvm/test/CodeGen/X86/insert-prefetch-other.afdo deleted file mode 100644 index 783da34f7f84c..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42 - 1.1: 0 __prefetch_t1_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.afdo b/llvm/test/CodeGen/X86/insert-prefetch.afdo deleted file mode 100644 index 96487e85eaaf2..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42 - 1.1: 0 __prefetch_nta_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.ll b/llvm/test/CodeGen/X86/insert-prefetch.ll deleted file mode 100644 index 971a6193862d0..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, 
int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; NOTE: debug line numbers were adjusted such that the function would start -; at line 15 (an arbitrary number). The sample profile file format uses -; offsets from the start of the symbol instead of file-relative line numbers. -; The .afdo file reflects that - the instructions are offset '1'. -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !38 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !38 - %0 = load i32, ptr %arrayidx, align 4, !dbg !38, !tbaa !39 - %idxprom1 = sext i32 %pos2 to i64, !dbg !43 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !43 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !43, !tbaa !39 - %add = add nsw i32 %1, %0, !dbg !44 - ret i32 %add, !dbg !45 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5, !6} -!llvm.ident = !{!33} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{i32 1, !"ProfileSummary", !7} -!7 = !{!8, !9, !10, !11, !12, !13, !14, !15} -!8 = !{!"ProfileFormat", !"SampleProfile"} -!9 = !{!"TotalCount", i64 0} -!10 = !{!"MaxCount", i64 0} -!11 = !{!"MaxInternalCount", i64 0} -!12 = !{!"MaxFunctionCount", i64 0} -!13 = !{!"NumCounts", i64 2} -!14 = !{!"NumFunctions", i64 1} -!15 = !{!"DetailedSummary", !16} -!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32} -!17 = !{i32 10000, i64 0, i32 0} -!18 = !{i32 100000, i64 0, i32 0} -!19 = !{i32 200000, i64 0, i32 0} -!20 = !{i32 300000, i64 0, i32 0} -!21 = !{i32 400000, i64 0, i32 0} -!22 = !{i32 500000, i64 0, i32 0} -!23 = !{i32 600000, i64 0, i32 0} -!24 = !{i32 700000, i64 0, i32 0} -!25 = !{i32 800000, i64 0, i32 0} -!26 = !{i32 900000, i64 0, i32 0} -!27 = !{i32 950000, i64 0, i32 0} -!28 = !{i32 990000, i64 0, i32 0} -!29 = !{i32 999000, i64 0, i32 0} -!30 = !{i32 999900, i64 0, i32 0} -!31 = !{i32 999990, i64 0, i32 0} -!32 = !{i32 999999, i64 0, i32 0} -!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"} -!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!36 = !DISubroutineType(types: !2) -!37 = !{!"function_entry_count", i64 -1} -!38 = !DILocation(line: 16, column: 10, scope: !35) -!39 = !{!40, !40, i64 0} -!40 = !{!"int", !41, i64 0} -!41 = !{!"omnipotent char", !42, i64 0} -!42 = !{!"Simple C++ TBAA"} -!43 = !DILocation(line: 16, column: 22, scope: !35) -!44 = !DILocation(line: 16, column: 20, scope: !35) -!45 = !DILocation(line: 16, column: 3, scope: !35) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetchnta 42(%rdi,%rax,4) -;CHECK-NEXT: prefetchnta (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;CHECK-NEXT: prefetchnta -1(%rdi,%rcx,4) -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 
16 3 # test.cc:16:3 - -;OTHERS-LABEL: sum: -;OTHERS: # %bb.0: -;OTHERS: prefetcht2 42(%rdi,%rax,4) -;OTHERS-NEXT: prefetcht0 (%rdi,%rax,4) -;OTHERS-NEXT: movl (%rdi,%rax,4), %eax -;OTHERS-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;OTHERS-NEXT: prefetcht1 -1(%rdi,%rcx,4) -;OTHERS-NEXT: addl (%rdi,%rcx,4), %eax -;OTHERS-NEXT: .loc 1 16 3 # test.cc:16:3 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 81390e59d0d0a..276232e27c000 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -208,8 +208,6 @@ ; CHECK-NEXT: X86 Fixup Inst Tuning ; CHECK-NEXT: X86 Fixup Vector Constants ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses From 472e4ab0b02d3dec001f885beb535c9d727d1ea2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 10:07:48 -0800 Subject: [PATCH 058/105] [MLGO] Fully Remove MLRegalloc Experimental Features (#168252) 20a22a45e96bc94c3a8295cccc9031bd87552725 was supposed to fully remove these, but left around the functionality to actually compute them and a unittest that ensured they worked. These are not development features in the sense of features used in development mode, but experimental features that have been superseded by MIR2Vec. --- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 137 -------- llvm/unittests/CodeGen/CMakeLists.txt | 1 - .../CodeGen/MLRegAllocDevelopmentFeatures.cpp | 293 ------------------ 3 files changed, 431 deletions(-) delete mode 100644 llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 32b6c46303828..34531dd7ab17f 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -133,10 +133,6 @@ INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass", // Common ML Advisor declarations // =================================== namespace { -// The model can only accept a specified number of opcodes and will error it if -// fed an opcode it hasn't seen before. This constant sets the current cutoff. -static const int OpcodeValueCutoff = 17716; - // Most features are as described above, so we'll reuse this vector in defining // them. static const std::vector PerLiveRangeShape{1, NumberOfInterferences}; @@ -948,139 +944,6 @@ void MLEvictAdvisor::extractFeatures( #undef SET } -void llvm::extractInstructionFeatures( - SmallVectorImpl &LRPosInfo, MLModelRunner *RegallocRunner, - function_ref GetOpcode, - function_ref GetMBBFreq, - function_ref GetMBBReference, - const int InstructionsIndex, const int InstructionsMappingIndex, - const int MBBFreqIndex, const int MBBMappingIndex, - const SlotIndex LastIndex) { - // This function extracts instruction based features relevant to the eviction - // problem currently being solved. This function ends up extracting two - // tensors. - // 1 - A vector of size max instruction count. It contains the opcodes of the - // instructions spanned by all the intervals in the current instance of the - // eviction problem. - // 2 - A binary mapping matrix of size (LR count * max - // instruction count) which maps where the LRs are live to the actual opcodes - // for which they are live. 
- // 3 - A vector of size max supported MBB count storing MBB frequencies, - // encompassing all of the MBBs covered by the eviction problem. - // 4 - A vector of size max instruction count of indices to members of the MBB - // frequency vector, mapping each instruction to its associated MBB. - - // Start off by sorting the segments based on the beginning slot index. - std::sort( - LRPosInfo.begin(), LRPosInfo.end(), - [](LRStartEndInfo A, LRStartEndInfo B) { return A.Begin < B.Begin; }); - size_t InstructionIndex = 0; - size_t CurrentSegmentIndex = 0; - SlotIndex CurrentIndex = LRPosInfo[0].Begin; - std::map VisitedMBBs; - size_t CurrentMBBIndex = 0; - // This loop processes all the segments sequentially by starting at the - // beginning slot index of the first segment, iterating through all the slot - // indices before the end slot index of that segment (while checking for - // overlaps with segments that start at greater slot indices). After hitting - // that end index, the current segment being processed gets bumped until they - // are all processed or the max instruction count is hit, where everything is - // just truncated. - while (true) { - // If the index that we are currently at is within the current segment and - // we haven't hit the max instruction count, continue processing the current - // segment. - while (CurrentIndex <= LRPosInfo[CurrentSegmentIndex].End && - InstructionIndex < ModelMaxSupportedInstructionCount) { - int CurrentOpcode = GetOpcode(CurrentIndex); - // If the current machine instruction is null, skip it - if (CurrentOpcode == -1) { - // If we're currently at the last index in the SlotIndex analysis, - // we can't go any further, so return from the function - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - continue; - } - MachineBasicBlock *CurrentMBBReference = GetMBBReference(CurrentIndex); - if (VisitedMBBs.count(CurrentMBBReference) == 0) { - VisitedMBBs[CurrentMBBReference] = CurrentMBBIndex; - ++CurrentMBBIndex; - } - extractMBBFrequency(CurrentIndex, InstructionIndex, VisitedMBBs, - GetMBBFreq, CurrentMBBReference, RegallocRunner, - MBBFreqIndex, MBBMappingIndex); - // Current code assumes we're not going to get any disjointed segments - assert(LRPosInfo[CurrentSegmentIndex].Begin <= CurrentIndex); - RegallocRunner->getTensor(InstructionsIndex)[InstructionIndex] = - CurrentOpcode < OpcodeValueCutoff ? CurrentOpcode : 0; - // set value in the binary mapping matrix for the current instruction - auto CurrentSegmentPosition = LRPosInfo[CurrentSegmentIndex].Pos; - RegallocRunner->getTensor( - InstructionsMappingIndex)[CurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - // All of the segments are sorted based on the beginning slot index, but - // this doesn't mean that the beginning slot index of the next segment is - // after the end segment of the one being currently processed. This while - // loop checks for overlapping segments and modifies the portion of the - // column in the mapping matrix for the currently processed instruction - // for the LR it is checking. Also make sure that the beginning of the - // current segment we're checking for overlap in is less than the current - // index, otherwise we're done checking overlaps. 
- size_t OverlapCheckCurrentSegment = CurrentSegmentIndex + 1; - while (OverlapCheckCurrentSegment < LRPosInfo.size() && - LRPosInfo[OverlapCheckCurrentSegment].Begin <= CurrentIndex) { - auto OverlapCurrentSegmentPosition = - LRPosInfo[OverlapCheckCurrentSegment].Pos; - if (LRPosInfo[OverlapCheckCurrentSegment].End >= CurrentIndex) { - RegallocRunner->getTensor( - InstructionsMappingIndex)[OverlapCurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - } - ++OverlapCheckCurrentSegment; - } - ++InstructionIndex; - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - } - // if we've just finished processing through the last segment or if we've - // hit the maximum number of instructions, break out of the loop. - if (CurrentSegmentIndex == LRPosInfo.size() - 1 || - InstructionIndex >= ModelMaxSupportedInstructionCount) { - break; - } - // If the segments are not overlapping, we need to move to the beginning - // index of the next segment to avoid having instructions not attached to - // any register. - if (LRPosInfo[CurrentSegmentIndex + 1].Begin > - LRPosInfo[CurrentSegmentIndex].End) { - CurrentIndex = LRPosInfo[CurrentSegmentIndex + 1].Begin; - } - ++CurrentSegmentIndex; - } -} - -void llvm::extractMBBFrequency( - const SlotIndex CurrentIndex, const size_t CurrentInstructionIndex, - std::map &VisitedMBBs, - function_ref GetMBBFreq, - MachineBasicBlock *CurrentMBBReference, MLModelRunner *RegallocRunner, - const int MBBFreqIndex, const int MBBMappingIndex) { - size_t CurrentMBBIndex = VisitedMBBs[CurrentMBBReference]; - float CurrentMBBFreq = GetMBBFreq(CurrentIndex); - if (CurrentMBBIndex < ModelMaxSupportedMBBCount) { - RegallocRunner->getTensor(MBBFreqIndex)[CurrentMBBIndex] = - CurrentMBBFreq; - RegallocRunner->getTensor( - MBBMappingIndex)[CurrentInstructionIndex] = CurrentMBBIndex; - } -} - // Development mode-specific implementations #ifdef LLVM_HAVE_TFLITE diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 4d07462babefa..80d10138d7bfe 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -49,7 +49,6 @@ add_llvm_unittest(CodeGenTests TypeTraitsTest.cpp TargetOptionsTest.cpp TestAsmPrinter.cpp - MLRegAllocDevelopmentFeatures.cpp X86MCInstLowerTest.cpp ) diff --git a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp b/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp deleted file mode 100644 index 00c2c3abf8533..0000000000000 --- a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp +++ /dev/null @@ -1,293 +0,0 @@ -//===- MLRegAllocDevelopmentFeatures.cpp - test dev MLRegAlloc features ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "../../lib/CodeGen/MLRegAllocEvictAdvisor.h" -#include "llvm/Analysis/NoInferenceModelRunner.h" -#include "llvm/CodeGen/CodeGenTargetMachineImpl.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/TargetParser/Triple.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include -#include - -using namespace llvm; -using testing::ContainerEq; -using testing::Test; - -namespace { - -#include "MFCommon.inc" - -struct LRPosInfoIndexes { - size_t StartIndex; - size_t EndIndex; - size_t PhysReg; -}; - -class RegAllocDevelopmentFeaturesTest : public ::Test { -protected: - SmallVector - setupOverlapProblem(const SmallVectorImpl &Segments, - simple_ilist &IndexList) { - SmallVector PositionsToReturn; - PositionsToReturn.reserve(Segments.size()); - for (auto CurrentPosIndexInfo : Segments) { - LRStartEndInfo CurrentPosInfo = {}; - CurrentPosInfo.Pos = CurrentPosIndexInfo.PhysReg; - PositionsToReturn.push_back(CurrentPosInfo); - } - size_t CurrentSegmentIndex = 0; - size_t CurrentIndex = 0; - while (CurrentSegmentIndex < Segments.size()) { - auto *CurrentLEMem = static_cast( - Allocator.Allocate(sizeof(IndexListEntry), alignof(IndexListEntry))); - auto *CurrentListEntry = - new (CurrentLEMem) IndexListEntry(nullptr, CurrentIndex); - IndexList.push_back(*CurrentListEntry); - for (size_t CurrentPosInfoIndex = 0; - CurrentPosInfoIndex < Segments.size(); ++CurrentPosInfoIndex) { - if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].StartIndex) { - PositionsToReturn[CurrentPosInfoIndex].Begin = - SlotIndex(CurrentListEntry, 0); - } else if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].EndIndex) { - PositionsToReturn[CurrentPosInfoIndex].End = - SlotIndex(CurrentListEntry, 0); - ++CurrentSegmentIndex; - } - } - CurrentIndex += SlotIndex::InstrDist; - } - return PositionsToReturn; - } - - NoInferenceModelRunner setupModelRunner() { - const std::vector Inputs{ - TensorSpec::createSpec("instructions", InstructionsShape), - TensorSpec::createSpec("instructions_mapping", - InstructionsMappingShape), - TensorSpec::createSpec("mbb_frequencies", MBBFrequencyShape), - TensorSpec::createSpec("mbb_mapping", InstructionsShape)}; - LLVMContext Ctx; - return NoInferenceModelRunner(Ctx, Inputs); - } - - std::vector - getExpectedMappingMatrix(SmallVectorImpl &OverlapSetup) { - std::vector ExpectedMappingMatrix( - NumberOfInterferences * ModelMaxSupportedInstructionCount, 0); - for (auto NewSegment : OverlapSetup) { - for (size_t CurrentIndex = NewSegment.StartIndex; - CurrentIndex <= NewSegment.EndIndex; ++CurrentIndex) { - ExpectedMappingMatrix[NewSegment.PhysReg * - ModelMaxSupportedInstructionCount + - CurrentIndex] = 1; - } - } - return ExpectedMappingMatrix; - } - - void runOverlapTest(SmallVectorImpl &OverlapSetup) { - simple_ilist IndexList; - auto OverlapProblem = 
setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - size_t MaxIndex = 0; - for (size_t CurrentOverlap = 0; CurrentOverlap < OverlapSetup.size(); - ++CurrentOverlap) { - if (OverlapSetup[CurrentOverlap].EndIndex > - OverlapSetup[MaxIndex].EndIndex) { - MaxIndex = CurrentOverlap; - } - } - SlotIndex LastIndex = OverlapProblem[MaxIndex].End; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, - 1, 2, 3, LastIndex); - std::vector MappingMatrix( - ModelRunner.getTensor(1), - ModelRunner.getTensor(1) + - NumberOfInterferences * ModelMaxSupportedInstructionCount); - ASSERT_THAT(MappingMatrix, - ContainerEq(getExpectedMappingMatrix(OverlapSetup))); - IndexList.clear(); - } - - BumpPtrAllocator Allocator; -}; - -// meta tests to ensure that test setup works correctly - -TEST_F(RegAllocDevelopmentFeaturesTest, - MetaOverlapInstructionDistancesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 5, 0}); - OverlapSetup.push_back({5, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].End), - 5 * SlotIndex::InstrDist); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].Begin), 0); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MetaSlotIndicesAreValid) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_TRUE(OverlapProblem[0].Begin.isValid()); - ASSERT_TRUE(OverlapProblem[0].End.isValid()); -} - -// Testing of feature extraction for per-instruction features - -TEST_F(RegAllocDevelopmentFeaturesTest, InstructionOpcodesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [FirstIndex](SlotIndex InputSlot) -> int { - return FirstIndex.distance(InputSlot) / SlotIndex::InstrDist; - }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, 1, - 2, 3, LastIndex); - for (size_t CurrentInstructionIndex = 0; - CurrentInstructionIndex < ModelMaxSupportedInstructionCount; - ++CurrentInstructionIndex) { - ASSERT_EQ( - (size_t)ModelRunner.getTensor(0)[CurrentInstructionIndex], - CurrentInstructionIndex); - } -} - -TEST_F(RegAllocDevelopmentFeaturesTest, FullOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 20, 0}); - OverlapSetup.push_back({15, 30, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlapOpposite) { - SmallVector OverlapSetup; - OverlapSetup.push_back({15, 30, 1}); - OverlapSetup.push_back({0, 20, 0}); - runOverlapTest(OverlapSetup); -} - 
-TEST_F(RegAllocDevelopmentFeaturesTest, InternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 20, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, TripleInternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 25, 1}); - OverlapSetup.push_back({15, 20, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, InternalMultiOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 45, 0}); - OverlapSetup.push_back({30, 40, 1}); - OverlapSetup.push_back({35, 60, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, SingleMBBTest) { - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex CurrentIndex; - // set index to 1 so we can ensure that the mapping actually get set - std::map VisitedMBBs = {{nullptr, 1}}; - extractMBBFrequency( - CurrentIndex, 0, VisitedMBBs, - [](SlotIndex InputSlot) -> float { return 1.0f; }, nullptr, &ModelRunner, - 2, 3); - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[1], 1.0f); - ASSERT_EQ(ModelRunner.getTensor(3)[0], 1); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MBBFullTruncated) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - - LLVMContext Ctx; - Module Mod("Module", Ctx); - auto MF = createMachineFunction(Ctx, Mod); - std::array - MBBsForTest; - for (size_t I = 0; I < ModelMaxSupportedInstructionCount; ++I) { - MBBsForTest[I] = MF->CreateMachineBasicBlock(); - } - - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [FirstIndex](SlotIndex InputSlot) -> float { - return static_cast(FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist); - }, - [FirstIndex, MBBsForTest](SlotIndex InputSlot) -> MachineBasicBlock * { - return MBBsForTest[FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist]; - }, - 0, 1, 2, 3, LastIndex); - for (size_t MBBIndex = 0; MBBIndex < ModelMaxSupportedMBBCount; ++MBBIndex) { - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[MBBIndex], - static_cast(MBBIndex)); - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(MBBIndex)); - } - // the rest of the mapping values should be zero (truncated to 100 MBBs) - for (size_t MBBIndex = ModelMaxSupportedMBBCount; - MBBIndex < ModelMaxSupportedInstructionCount; ++MBBIndex) { - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(0)); - } -} - -} // end namespace From 4be0ab659e6a65436c4e3629706318acd0c1cdc9 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 17 Nov 2025 12:23:02 -0600 Subject: [PATCH 059/105] [flang][OpenMP] Undeprecate accidentally deprecated TARGET LOOP (#167495) --- flang/lib/Semantics/resolve-directives.cpp | 7 +++---- .../test/Semantics/OpenMP/target-loop-still-there.f90 | 10 ++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/target-loop-still-there.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 68d007bc2de7e..c4d103613b587 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2038,8 +2038,7 @@ 
bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { if (beginName.v == llvm::omp::OMPD_master_taskloop || beginName.v == llvm::omp::OMPD_master_taskloop_simd || beginName.v == llvm::omp::OMPD_parallel_master_taskloop || - beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd || - beginName.v == llvm::omp::Directive::OMPD_target_loop) { + beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd) { unsigned version{context_.langOptions().OpenMPVersion}; IssueNonConformanceWarning(beginName.v, beginName.source, version); } @@ -3622,8 +3621,8 @@ void OmpAttributeVisitor::IssueNonConformanceWarning(llvm::omp::Directive D, case llvm::omp::OMPD_allocate: setAlternativeStr("ALLOCATORS"); break; - case llvm::omp::OMPD_target_loop: - default:; + default: + break; } context_.Warn(common::UsageWarning::OpenMPUsage, source, "%s"_warn_en_US, warnStrOS.str()); diff --git a/flang/test/Semantics/OpenMP/target-loop-still-there.f90 b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 new file mode 100644 index 0000000000000..2d3b1820e23d4 --- /dev/null +++ b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 @@ -0,0 +1,10 @@ +!RUN: %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 -Werror %s | FileCheck --allow-empty %s + +!CHECK-NOT: deprecated +subroutine f00 + implicit none + integer :: i + !$omp target loop + do i = 1, 10 + end do +end From c4be17a8877ba406bcda63c5398bc09ebb32598a Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt Date: Mon, 17 Nov 2025 10:28:26 -0800 Subject: [PATCH 060/105] [compiler-rt] [libFuzzer] Fix merge-posix.test file size test (#168137) This test uses `ulimit -f 1` to test what libFuzzer does when trying to create a file > **_1KB_**. However, none of the input files used by this test are actually >= 1KB, so there's no reason to expect this test to pass. This test appears to have been passing by accident since the "control file" happens to be > 1KB, but this is not always the case depending upon the length of the path where the test is run from. This modifies the test to ensure that one of the input files is actually > 1KB. --- compiler-rt/test/fuzzer/merge-posix.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/fuzzer/merge-posix.test b/compiler-rt/test/fuzzer/merge-posix.test index 2721668fb9706..5e342142216f8 100644 --- a/compiler-rt/test/fuzzer/merge-posix.test +++ b/compiler-rt/test/fuzzer/merge-posix.test @@ -14,7 +14,7 @@ RUN: echo ....U. > %tmp/T2/2 RUN: echo ...Z.. > %tmp/T2/3 RUN: echo ...Z.. > %tmp/T2/4 RUN: echo ....E. > %tmp/T2/5 -RUN: echo .....R > %tmp/T2/6 +RUN: %python -c "print('.....R' + 'X' * 1024, end='')" > %tmp/T2/6 # Check that we can report an error if file size exceeded RUN: (ulimit -f 1; not %run %t-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) From cc304e5a5cf43d454d597eb9108f0bc7e6605722 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Mon, 17 Nov 2025 18:39:41 +0000 Subject: [PATCH 061/105] [TableGen] Strip directories from filename prefixes. (#168355) Fixes https://github.com/llvm/llvm-project/pull/167700 to support builds where TableGen's output file is specified as a full path rather than just a filename.
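(Illustrative, hypothetical example of the fix, not part of the patch: for an output file given as `out/Options.inc`, the filename prefix handed to the backends is now the bare stem `Options`, as computed by `sys::path::stem`, rather than the directory-qualified `out/Options` that the old `replace_extension`-based code produced.)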
--- llvm/lib/TableGen/Main.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index c3869c3fb9a5a..3330b70cdc2e1 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -167,12 +167,11 @@ int llvm::TableGenMain(const char *argv0, // Write output to memory. Timer.startBackendTimer("Backend overall"); - SmallString<128> FilenamePrefix(OutputFilename); - sys::path::replace_extension(FilenamePrefix, ""); TableGenOutputFiles OutFiles; unsigned status = 0; // ApplyCallback will return true if it did not apply any callback. In that // case, attempt to apply the MainFn. + StringRef FilenamePrefix(sys::path::stem(OutputFilename)); if (TableGen::Emitter::ApplyCallback(Records, OutFiles, FilenamePrefix)) status = MainFn ? MainFn(OutFiles, Records) : 1; Timer.stopBackendTimer(); @@ -195,7 +194,7 @@ SmallString<128> Filename(OutputFilename); // TODO: Format using the split-file convention when writing to stdout? if (Filename != "-") { - Filename = FilenamePrefix; + sys::path::replace_extension(Filename, ""); Filename.append(Suffix); } if (int Ret = WriteOutput(Parser, argv0, Filename, Content)) From aa4de7b4ef510427b5997e525feb642fc0c51053 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 17 Nov 2025 10:41:05 -0800 Subject: [PATCH 062/105] [bazel] Add support for multiple tblgen outputs (#168158) Required after https://github.com/llvm/llvm-project/pull/167700 This adds yet another format for `tbl_outs`, where you pass a list of opts together with a list of outputs (previously each entry could have only one output). In that case, all listed outputs must be produced, but only the first is used for the `-o` argument, since tblgen generates the other names based on that single argument.
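To make the new shape concrete, here is a minimal, hypothetical sketch mixing the two tuple forms (the `foo_tablegen` target, `Foo.td`, and the output names are invented for illustration and are not part of this change):

```
gentbl_cc_library(
    name = "foo_tablegen",
    tbl_outs = [
        # ([opts], out): a single output file, passed to tblgen via -o.
        (["-gen-instr-info"], "FooGenInstrInfo.inc"),
        # ([opts], [outs]): the first path is passed via -o; tblgen is
        # expected to also produce the remaining listed files itself.
        (
            ["-gen-register-info"],
            [
                "FooGenRegisterInfo.inc",
                "FooGenRegisterInfoEnums.inc",
            ],
        ),
    ],
    tblgen = ":llvm-tblgen",
    td_file = "Foo.td",
)
```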
--- .../llvm-project-overlay/llvm/BUILD.bazel | 1506 +++++++++++++---- .../llvm-project-overlay/mlir/tblgen.bzl | 23 +- 2 files changed, 1187 insertions(+), 342 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 67c397e34b8c7..a94442af376e5 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2193,92 +2193,249 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AArch64", "short_name": "AArch64", - "tbl_outs": { - "lib/Target/AArch64/AArch64GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AArch64/AArch64GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AArch64/AArch64GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AArch64/AArch64GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AArch64/AArch64GenAsmWriter1.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/AArch64/AArch64GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AArch64/AArch64GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AArch64/AArch64GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/AArch64/AArch64GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64O0PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerLowering", - ], - "lib/Target/AArch64/AArch64GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AArch64/AArch64GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AArch64/AArch64GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AArch64/AArch64GenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AArch64/AArch64GenSystemOperands.inc": ["-gen-searchable-tables"], - "lib/Target/AArch64/AArch64GenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AArch64/AArch64GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AArch64/AArch64GenRegisterInfo.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoEnums.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoMCDesc.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoHeader.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AArch64/AArch64GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AArch64/AArch64GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/AArch64/AArch64GenAsmWriter1.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AArch64/AArch64GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AArch64/AArch64GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/AArch64/AArch64GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + 
"lib/Target/AArch64/AArch64GenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64O0PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerLowering", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AArch64/AArch64GenCallingConv.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AArch64/AArch64GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AArch64/AArch64GenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/AArch64/AArch64GenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AArch64/AArch64GenSystemOperands.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/AArch64/AArch64GenExegesis.inc", + ), + ], }, { "name": "ARM", "short_name": "ARM", - "tbl_outs": { - "lib/Target/ARM/ARMGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/ARM/ARMGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/ARM/ARMGenSystemRegister.inc": ["-gen-searchable-tables"], - "lib/Target/ARM/ARMGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/ARM/ARMGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/ARM/ARMGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/ARM/ARMGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/ARM/ARMGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/ARM/ARMGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/ARM/ARMGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/ARM/ARMGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/ARM/ARMGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/ARM/ARMGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/ARM/ARMGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/ARM/ARMGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/ARM/ARMGenRegisterInfo.inc", + "lib/Target/ARM/ARMGenRegisterInfoEnums.inc", + "lib/Target/ARM/ARMGenRegisterInfoMCDesc.inc", + "lib/Target/ARM/ARMGenRegisterInfoHeader.inc", + "lib/Target/ARM/ARMGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/ARM/ARMGenSystemRegister.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/ARM/ARMGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/ARM/ARMGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/ARM/ARMGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/ARM/ARMGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/ARM/ARMGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/ARM/ARMGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/ARM/ARMGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/ARM/ARMGenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/ARM/ARMGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/ARM/ARMGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/ARM/ARMGenDisassemblerTables.inc", + ), + ], }, { "name": "AMDGPU", 
"short_name": "AMDGPU", - "tbl_outs": { - "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - "-ignore-non-decodable-operands", - "-ignore-fully-defined-operands", - ], - "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc": ["-gen-searchable-tables"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + "-ignore-non-decodable-operands", + "-ignore-fully-defined-operands", + ], + "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc", + ), + ], "tbl_deps": [ ":InstCombineTableGen", ":amdgpu_isel_target_gen", @@ -2288,184 +2445,567 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AVR", "short_name": "AVR", - "tbl_outs": { - "lib/Target/AVR/AVRGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AVR/AVRGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AVR/AVRGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AVR/AVRGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AVR/AVRGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AVR/AVRGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AVR/AVRGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AVR/AVRGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AVR/AVRGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AVR/AVRGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/AVR/AVRGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AVR/AVRGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AVR/AVRGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AVR/AVRGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + 
"lib/Target/AVR/AVRGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AVR/AVRGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AVR/AVRGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AVR/AVRGenRegisterInfo.inc", + "lib/Target/AVR/AVRGenRegisterInfoEnums.inc", + "lib/Target/AVR/AVRGenRegisterInfoMCDesc.inc", + "lib/Target/AVR/AVRGenRegisterInfoHeader.inc", + "lib/Target/AVR/AVRGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AVR/AVRGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AVR/AVRGenSubtargetInfo.inc", + ), + ], }, { "name": "BPF", "short_name": "BPF", - "tbl_outs": { - "lib/Target/BPF/BPFGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/BPF/BPFGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/BPF/BPFGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/BPF/BPFGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/BPF/BPFGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/BPF/BPFGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/BPF/BPFGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/BPF/BPFGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/BPF/BPFGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/BPF/BPFGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/BPF/BPFGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/BPF/BPFGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/BPF/BPFGenRegisterBank.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/BPF/BPFGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/BPF/BPFGenAsmMatcher.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/BPF/BPFGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/BPF/BPFGenDAGISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/BPF/BPFGenGlobalISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/BPF/BPFGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/BPF/BPFGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/BPF/BPFGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/BPF/BPFGenRegisterInfo.inc", + "lib/Target/BPF/BPFGenRegisterInfoEnums.inc", + "lib/Target/BPF/BPFGenRegisterInfoMCDesc.inc", + "lib/Target/BPF/BPFGenRegisterInfoHeader.inc", + "lib/Target/BPF/BPFGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/BPF/BPFGenSubtargetInfo.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/BPF/BPFGenSDNodeInfo.inc", + ), + ], }, { "name": "Hexagon", "short_name": "Hexagon", - "tbl_outs": { - "lib/Target/Hexagon/HexagonGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Hexagon/HexagonGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Hexagon/HexagonGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Hexagon/HexagonGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/Hexagon/HexagonGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Hexagon/HexagonGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Hexagon/HexagonGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + 
"lib/Target/Hexagon/HexagonGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Hexagon/HexagonGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Hexagon/HexagonGenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Hexagon/HexagonGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Hexagon/HexagonGenRegisterInfo.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoEnums.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoMCDesc.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoHeader.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc", + ), + ], }, { "name": "Lanai", "short_name": "Lanai", - "tbl_outs": { - "lib/Target/Lanai/LanaiGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Lanai/LanaiGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Lanai/LanaiGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Lanai/LanaiGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Lanai/LanaiGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Lanai/LanaiGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Lanai/LanaiGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Lanai/LanaiGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/Lanai/LanaiGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Lanai/LanaiGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Lanai/LanaiGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Lanai/LanaiGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Lanai/LanaiGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Lanai/LanaiGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Lanai/LanaiGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Lanai/LanaiGenRegisterInfo.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoEnums.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoMCDesc.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoHeader.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/Lanai/LanaiGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Lanai/LanaiGenSubtargetInfo.inc", + ), + ], }, { "name": "LoongArch", "short_name": "LoongArch", - "tbl_outs": { - "lib/Target/LoongArch/LoongArchGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/LoongArch/LoongArchGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/LoongArch/LoongArchGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/LoongArch/LoongArchGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + 
"lib/Target/LoongArch/LoongArchGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/LoongArch/LoongArchGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/LoongArch/LoongArchGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/LoongArch/LoongArchGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoEnums.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoMCDesc.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoHeader.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc", + ), + ], }, { "name": "Mips", "short_name": "Mips", - "tbl_outs": { - "lib/Target/Mips/MipsGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Mips/MipsGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Mips/MipsGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Mips/MipsGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Mips/MipsGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - "lib/Target/Mips/MipsGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Mips/MipsGenExegesis.inc": ["-gen-exegesis"], - "lib/Target/Mips/MipsGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/Mips/MipsGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=MipsPostLegalizerCombiner", - ], - "lib/Target/Mips/MipsGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Mips/MipsGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/Mips/MipsGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/Mips/MipsGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Mips/MipsGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Mips/MipsGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Mips/MipsGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Mips/MipsGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Mips/MipsGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/Mips/MipsGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Mips/MipsGenMCCodeEmitter.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/Mips/MipsGenExegesis.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/Mips/MipsGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/Mips/MipsGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=MipsPostLegalizerCombiner", + ], + "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Mips/MipsGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/Mips/MipsGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/Mips/MipsGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Mips/MipsGenRegisterInfo.inc", + "lib/Target/Mips/MipsGenRegisterInfoEnums.inc", + "lib/Target/Mips/MipsGenRegisterInfoMCDesc.inc", + "lib/Target/Mips/MipsGenRegisterInfoHeader.inc", + 
"lib/Target/Mips/MipsGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Mips/MipsGenSubtargetInfo.inc", + ), + ], }, { "name": "MSP430", "short_name": "MSP430", - "tbl_outs": { - "lib/Target/MSP430/MSP430GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/MSP430/MSP430GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/MSP430/MSP430GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/MSP430/MSP430GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/MSP430/MSP430GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/MSP430/MSP430GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/MSP430/MSP430GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/MSP430/MSP430GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/MSP430/MSP430GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/MSP430/MSP430GenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/MSP430/MSP430GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/MSP430/MSP430GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/MSP430/MSP430GenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/MSP430/MSP430GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/MSP430/MSP430GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/MSP430/MSP430GenRegisterInfo.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoEnums.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoMCDesc.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoHeader.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/MSP430/MSP430GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/MSP430/MSP430GenSubtargetInfo.inc", + ), + ], }, { "name": "NVPTX", "short_name": "NVPTX", - "tbl_outs": { - "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/NVPTX/NVPTXGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/NVPTX/NVPTXGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/NVPTX/NVPTXGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-register-info"], + [ + "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoEnums.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoMCDesc.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoHeader.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/NVPTX/NVPTXGenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/NVPTX/NVPTXGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/NVPTX/NVPTXGenDAGISel.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc", + ), + ], }, { "name": "PowerPC", "short_name": "PPC", - "tbl_outs": { - "lib/Target/PowerPC/PPCGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/PowerPC/PPCGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/PowerPC/PPCGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/PowerPC/PPCGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/PowerPC/PPCGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/PowerPC/PPCGenFastISel.inc": ["-gen-fast-isel"], - 
"lib/Target/PowerPC/PPCGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/PowerPC/PPCGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/PowerPC/PPCGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/PowerPC/PPCGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/PowerPC/PPCGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/PowerPC/PPCGenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/PowerPC/PPCGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/PowerPC/PPCGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/PowerPC/PPCGenRegisterInfo.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoEnums.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoMCDesc.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoHeader.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/PowerPC/PPCGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/PowerPC/PPCGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/PowerPC/PPCGenFastISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/PowerPC/PPCGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/PowerPC/PPCGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/PowerPC/PPCGenDisassemblerTables.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/PowerPC/PPCGenRegisterBank.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/PowerPC/PPCGenGlobalISel.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/PowerPC/PPCGenExegesis.inc", + ), + ], }, { "name": "RISCV", "short_name": "RISCV", - "tbl_outs": { - "lib/Target/RISCV/RISCVGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/RISCV/RISCVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc": ["-gen-compress-inst-emitter"], - "lib/Target/RISCV/RISCVGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/RISCV/RISCVGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - ], - "lib/Target/RISCV/RISCVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/RISCV/RISCVGenMacroFusion.inc": ["-gen-macro-fusion-pred"], - "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/RISCV/RISCVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/RISCV/RISCVGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/RISCV/RISCVGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/RISCV/RISCVGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/RISCV/RISCVGenExegesis.inc": ["-gen-exegesis"], - "lib/Target/RISCV/RISCVGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/RISCV/RISCVGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/RISCV/RISCVGenAsmWriter.inc", + ), + ( + ["-gen-compress-inst-emitter"], + "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/RISCV/RISCVGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + ], + "lib/Target/RISCV/RISCVGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/RISCV/RISCVGenInstrInfo.inc", + ), + ( + ["-gen-macro-fusion-pred"], + "lib/Target/RISCV/RISCVGenMacroFusion.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc", + ), + 
( + ["-gen-pseudo-lowering"], + "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/RISCV/RISCVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/RISCV/RISCVGenRegisterInfo.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoEnums.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoMCDesc.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoHeader.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/RISCV/RISCVGenSubtargetInfo.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/RISCV/RISCVGenSearchableTables.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/RISCV/RISCVGenExegesis.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/RISCV/RISCVGenSDNodeInfo.inc", + ), + ], "tbl_deps": [ ":riscv_isel_target_gen", ], @@ -2473,135 +3013,396 @@ llvm_target_lib_list = [lib for lib in [ { "name": "Sparc", "short_name": "Sparc", - "tbl_outs": { - "lib/Target/Sparc/SparcGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Sparc/SparcGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Sparc/SparcGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Sparc/SparcGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Sparc/SparcGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Sparc/SparcGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Sparc/SparcGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Sparc/SparcGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/Sparc/SparcGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/Sparc/SparcGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/Sparc/SparcGenSDNodeInfo.inc": [ - "-gen-sd-node-info", - "-sdnode-namespace=SPISD", - ], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/Sparc/SparcGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/Sparc/SparcGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Sparc/SparcGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Sparc/SparcGenRegisterInfo.inc", + "lib/Target/Sparc/SparcGenRegisterInfoEnums.inc", + "lib/Target/Sparc/SparcGenRegisterInfoMCDesc.inc", + "lib/Target/Sparc/SparcGenRegisterInfoHeader.inc", + "lib/Target/Sparc/SparcGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/Sparc/SparcGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Sparc/SparcGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Sparc/SparcGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Sparc/SparcGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Sparc/SparcGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/Sparc/SparcGenSearchableTables.inc", + ), + ( + [ + "-gen-sd-node-info", + "-sdnode-namespace=SPISD", + ], + "lib/Target/Sparc/SparcGenSDNodeInfo.inc", + ), + ], }, { "name": "SPIRV", "short_name": "SPIRV", - "tbl_outs": { - "lib/Target/SPIRV/SPIRVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SPIRV/SPIRVGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=SPIRVPreLegalizerCombiner", - ], - "lib/Target/SPIRV/SPIRVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SPIRV/SPIRVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc": ["-gen-register-info"], - 
"lib/Target/SPIRV/SPIRVGenTables.inc": ["-gen-searchable-tables"], - "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/SPIRV/SPIRVGenAsmWriter.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/SPIRV/SPIRVGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=SPIRVPreLegalizerCombiner", + ], + "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SPIRV/SPIRVGenInstrInfo.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/SPIRV/SPIRVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoEnums.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoMCDesc.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoHeader.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/SPIRV/SPIRVGenTables.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc", + ), + ], }, { "name": "SystemZ", "short_name": "SystemZ", - "tbl_outs": { - "lib/Target/SystemZ/SystemZGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/SystemZ/SystemZGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/SystemZ/SystemZGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SystemZ/SystemZGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SystemZ/SystemZGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/SystemZ/SystemZGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/SystemZ/SystemZGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/SystemZ/SystemZGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SystemZ/SystemZGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SystemZ/SystemZGenRegisterInfo.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoEnums.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoMCDesc.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoHeader.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc", + ), + ], }, { "name": "VE", "short_name": "VE", - "tbl_outs": { - "lib/Target/VE/VEGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/VE/VEGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/VE/VEGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/VE/VEGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/VE/VEGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/VE/VEGenMCCodeEmitter.inc": ["-gen-emitter"], - 
"lib/Target/VE/VEGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/VE/VEGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/VE/VEGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/VE/VEGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/VE/VEGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/VE/VEGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/VE/VEGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/VE/VEGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/VE/VEGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/VE/VEGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/VE/VEGenRegisterInfo.inc", + "lib/Target/VE/VEGenRegisterInfoEnums.inc", + "lib/Target/VE/VEGenRegisterInfoMCDesc.inc", + "lib/Target/VE/VEGenRegisterInfoHeader.inc", + "lib/Target/VE/VEGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/VE/VEGenSubtargetInfo.inc", + ), + ], }, { "name": "WebAssembly", "short_name": "WebAssembly", - "tbl_outs": { - "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/WebAssembly/WebAssemblyGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-disassembler"], + "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/WebAssembly/WebAssemblyGenFastISel.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoEnums.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoMCDesc.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoHeader.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc", + ), + ], }, { "name": "X86", "short_name": "X86", - "tbl_outs": { - "lib/Target/X86/X86GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/X86/X86GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/X86/X86GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/X86/X86GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/X86/X86GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/X86/X86GenAsmWriter1.inc": [ - 
"-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/X86/X86GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/X86/X86GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/X86/X86GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/X86/X86GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/X86/X86GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/X86/X86GenFoldTables.inc": [ - "-gen-x86-fold-tables", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenInstrMapping.inc": ["-gen-x86-instr-mapping"], - "lib/Target/X86/X86GenExegesis.inc": ["-gen-exegesis"], - "lib/Target/X86/X86GenMnemonicTables.inc": [ - "-gen-x86-mnemonic-tables", - "-asmwriternum=1", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/X86/X86GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/X86/X86GenRegisterInfo.inc", + "lib/Target/X86/X86GenRegisterInfoEnums.inc", + "lib/Target/X86/X86GenRegisterInfoMCDesc.inc", + "lib/Target/X86/X86GenRegisterInfoHeader.inc", + "lib/Target/X86/X86GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-disassembler"], + "lib/Target/X86/X86GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/X86/X86GenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/X86/X86GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenAsmWriter1.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/X86/X86GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/X86/X86GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/X86/X86GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/X86/X86GenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/X86/X86GenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/X86/X86GenSubtargetInfo.inc", + ), + ( + [ + "-gen-x86-fold-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenFoldTables.inc", + ), + ( + ["-gen-x86-instr-mapping"], + "lib/Target/X86/X86GenInstrMapping.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/X86/X86GenExegesis.inc", + ), + ( + [ + "-gen-x86-mnemonic-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenMnemonicTables.inc", + ), + ], }, { "name": "XCore", "short_name": "XCore", - "tbl_outs": { - "lib/Target/XCore/XCoreGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/XCore/XCoreGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/XCore/XCoreGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/XCore/XCoreGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/XCore/XCoreGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/XCore/XCoreGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/XCore/XCoreGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/XCore/XCoreGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/XCore/XCoreGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/XCore/XCoreGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/XCore/XCoreGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/XCore/XCoreGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/XCore/XCoreGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/XCore/XCoreGenRegisterInfo.inc", + "lib/Target/XCore/XCoreGenRegisterInfoEnums.inc", + "lib/Target/XCore/XCoreGenRegisterInfoMCDesc.inc", + "lib/Target/XCore/XCoreGenRegisterInfoHeader.inc", + 
"lib/Target/XCore/XCoreGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/XCore/XCoreGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/XCore/XCoreGenSubtargetInfo.inc", + ), + ], }, ] if lib["name"] in llvm_targets] @@ -2639,16 +3440,46 @@ gentbl_cc_library( gentbl_cc_library( name = "r600_target_gen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = { - "lib/Target/AMDGPU/R600GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/R600GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/R600GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/R600GenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/AMDGPU/R600GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/R600GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/R600GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + tbl_outs = [ + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/R600GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/R600GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/R600GenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/AMDGPU/R600GenDFAPacketizer.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/R600GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AMDGPU/R600GenRegisterInfo.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/R600GenSubtargetInfo.inc", + ), + ], tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/R600.td", deps = [ @@ -3381,7 +4212,10 @@ cc_library( gentbl_cc_library( name = "LibOptionsTableGen", strip_include_prefix = "lib/ToolDrivers/llvm-lib", - tbl_outs = {"lib/ToolDrivers/llvm-lib/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = [( + ["-gen-opt-parser-defs"], + "lib/ToolDrivers/llvm-lib/Options.inc", + )], tblgen = ":llvm-tblgen", td_file = "lib/ToolDrivers/llvm-lib/Options.td", deps = [":OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl index 35888aac37e17..d28a8854fa896 100644 --- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl +++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl @@ -153,7 +153,7 @@ def _gentbl_rule_impl(ctx): args.add("-o", ctx.outputs.out) ctx.actions.run( - outputs = [ctx.outputs.out], + outputs = [ctx.outputs.out] + ctx.outputs.additional_outputs, inputs = trans_srcs, executable = ctx.executable.tblgen, execution_requirements = {"supports-path-mapping": "1"}, @@ -195,6 +195,9 @@ gentbl_rule = rule( doc = "The output file for the TableGen invocation.", mandatory = True, ), + "additional_outputs": attr.output_list( + doc = "Extra output files from the TableGen invocation. The primary 'out' is used for the -o argument.", + ), "opts": attr.string_list( doc = "Additional command line options to add to the TableGen" + " invocation. For include arguments, prefer to use" + @@ -313,9 +316,12 @@ def gentbl_filegroup( name: The name of the generated filegroup rule for use in dependencies. tblgen: The binary used to produce the output. td_file: The primary table definitions file. 
- tbl_outs: Either a dict {out: [opts]} or a list of tuples ([opts], out), - where each 'opts' is a list of options passed to tblgen, each option - being a string, and 'out' is the corresponding output file produced. + tbl_outs: Either a dict {out: [opts]}, a list of tuples ([opts], out), + or a list of tuples ([opts], [outs]). Each 'opts' is a list of options + passed to tblgen, each option being a string, + and 'out' is the corresponding output file produced. If 'outs' are used, + the first path in the list is passed to '-o' but tblgen is expected + to produce all listed outputs. td_srcs: See gentbl_rule.td_srcs includes: See gentbl_rule.includes deps: See gentbl_rule.deps @@ -325,9 +331,14 @@ **kwargs: Extra keyword arguments to pass to all generated rules. """ + included_srcs = [] if type(tbl_outs) == type({}): tbl_outs = [(v, k) for k, v in tbl_outs.items()] - for (opts, out) in tbl_outs: + for (opts, output_or_outputs) in tbl_outs: + outs = output_or_outputs if type(output_or_outputs) == type([]) else [output_or_outputs] + out = outs[0] + if not any([skip_opt in opts for skip_opt in skip_opts]): + included_srcs.extend(outs) first_opt = opts[0] if opts else "" rule_suffix = "_{}_{}".format( first_opt.replace("-", "_").replace("=", "_"), @@ -343,6 +354,7 @@ deps = deps, includes = includes, out = out, + additional_outputs = outs[1:], **kwargs ) @@ -364,7 +376,6 @@ **kwargs ) - included_srcs = [f for (opts, f) in tbl_outs if not any([skip_opt in opts for skip_opt in skip_opts])] native.filegroup( name = name, srcs = included_srcs, From 6360bbbb6890c965016a98fab8ea76551f577c1f Mon Sep 17 00:00:00 2001 From: David Tenty Date: Mon, 17 Nov 2025 13:46:37 -0500 Subject: [PATCH 063/105] [AIX] Raise soft memory limits to hard limits (#167928) AIX's out-of-the-box memory soft limits are often insufficient to run LLVM on reasonably sized inputs. Thus, we often encounter users who run into spurious out-of-memory errors. This change raises the memory soft limits to the hard limits at LLVM startup to prevent these types of issues. --- llvm/lib/Support/InitLLVM.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index b8fbfd21c4f28..b90f4e0714458 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -32,6 +32,34 @@ #endif #endif +static void RaiseLimits() { +#ifdef _AIX + // AIX has restrictive memory soft-limits out-of-box, so raise them if needed. + auto RaiseLimit = [](int resource) { + struct rlimit r; + getrlimit(resource, &r); + + // Increase the soft limit to the hard limit, if necessary and + // possible. + if (r.rlim_cur != RLIM_INFINITY && r.rlim_cur != r.rlim_max) { + r.rlim_cur = r.rlim_max; + setrlimit(resource, &r); + } + }; + + // Address space size. + RaiseLimit(RLIMIT_AS); + // Heap size. + RaiseLimit(RLIMIT_DATA); + // Stack size. + RaiseLimit(RLIMIT_STACK); +#ifdef RLIMIT_RSS + // Resident set size.
+ RaiseLimit(RLIMIT_RSS); +#endif +#endif +} + void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); @@ -67,6 +95,7 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, StackPrinter.emplace(Argc, Argv); sys::PrintStackTraceOnErrorSignal(Argv[0]); install_out_of_memory_new_handler(); + RaiseLimits(); #ifdef __MVS__ From bb9df2e3bd7ec903f5040ec9e78bdc9e06561d67 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Mon, 17 Nov 2025 10:51:13 -0800 Subject: [PATCH 064/105] [lldb] Ensure FILE* access mode is correctly specified when creating a NativeFile. (#167764) If we open a `NativeFile` with a `FILE*`, the OpenOptions default to `eOpenOptionReadOnly`. This is an issue in python scripts if you try to write to one of the files like `print("Hi", file=lldb.debugger.GetOutputFileHandle())`. To address this, we need to specify the access mode whenever we create a `NativeFile` from a `FILE*`. I also added an assert on the `NativeFile` that validates the file is opened with the correct access mode and updated `NativeFile::Read` and `NativeFile::Write` to check the access mode. Before these changes: ``` $ lldb -b -O 'script lldb.debugger.GetOutputFileHandle().write("abc")' (lldb) script lldb.debugger.GetOutputFileHandle().write("abc") Traceback (most recent call last): File "", line 1, in io.UnsupportedOperation: not writable ``` After: ``` $ lldb -b -O 'script lldb.debugger.GetOutputFileHandle().write("abc")' (lldb) script lldb.debugger.GetOutputFileHandle().write("abc") abc3 ``` Fixes #122387 --- lldb/include/lldb/API/SBFile.h | 3 ++ lldb/include/lldb/Host/File.h | 5 ++- lldb/include/lldb/Host/StreamFile.h | 3 +- lldb/source/API/SBCommandReturnObject.cpp | 7 ++-- lldb/source/API/SBDebugger.cpp | 16 +++++---- lldb/source/API/SBFile.cpp | 17 ++++++++- lldb/source/API/SBInstruction.cpp | 5 +-- lldb/source/API/SBProcess.cpp | 4 ++- lldb/source/API/SBStream.cpp | 3 +- lldb/source/Core/Debugger.cpp | 9 +++-- lldb/source/Host/common/File.cpp | 42 +++++++++++++++++++---- lldb/source/Host/common/StreamFile.cpp | 3 +- lldb/unittests/Host/FileTest.cpp | 22 +++++++++++- 13 files changed, 113 insertions(+), 26 deletions(-) diff --git a/lldb/include/lldb/API/SBFile.h b/lldb/include/lldb/API/SBFile.h index ebdc5607b7942..8cf4fe1b405fa 100644 --- a/lldb/include/lldb/API/SBFile.h +++ b/lldb/include/lldb/API/SBFile.h @@ -27,7 +27,10 @@ class LLDB_API SBFile { SBFile(FileSP file_sp); #ifndef SWIG SBFile(const SBFile &rhs); + LLDB_DEPRECATED_FIXME("Use the constructor that specifies mode instead", + "SBFile(FILE*, const char*, bool)") SBFile(FILE *file, bool transfer_ownership); + SBFile(FILE *file, const char *mode, bool transfer_ownership); #endif SBFile(int fd, const char *mode, bool transfer_ownership); ~SBFile(); diff --git a/lldb/include/lldb/Host/File.h b/lldb/include/lldb/Host/File.h index 7402a2231735a..590c9fa523b29 100644 --- a/lldb/include/lldb/Host/File.h +++ b/lldb/include/lldb/Host/File.h @@ -66,6 +66,9 @@ class File : public IOObject { LLVM_MARK_AS_BITMASK_ENUM(/* largest_value= */ eOpenOptionInvalid) }; + static constexpr OpenOptions OpenOptionsModeMask = + eOpenOptionReadOnly | eOpenOptionWriteOnly | eOpenOptionReadWrite; + static mode_t ConvertOpenOptionsForPOSIXOpen(OpenOptions open_options); static llvm::Expected GetOptionsFromMode(llvm::StringRef mode); static bool DescriptorIsValid(int descriptor) { return descriptor >= 0; }; @@ -384,7 +387,7 @@ class NativeFile : public File { NativeFile(); - NativeFile(FILE *fh, bool 
transfer_ownership); + NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership); NativeFile(int fd, OpenOptions options, bool transfer_ownership); diff --git a/lldb/include/lldb/Host/StreamFile.h b/lldb/include/lldb/Host/StreamFile.h index e37661a9938c0..8b01eeab6f586 100644 --- a/lldb/include/lldb/Host/StreamFile.h +++ b/lldb/include/lldb/Host/StreamFile.h @@ -81,7 +81,8 @@ class LockableStreamFile { LockableStreamFile(StreamFile &stream_file, Mutex &mutex) : m_file_sp(stream_file.GetFileSP()), m_mutex(mutex) {} LockableStreamFile(FILE *fh, bool transfer_ownership, Mutex &mutex) - : m_file_sp(std::make_shared(fh, transfer_ownership)), + : m_file_sp(std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership)), m_mutex(mutex) {} LockableStreamFile(std::shared_ptr file_sp, Mutex &mutex) : m_file_sp(file_sp), m_mutex(mutex) {} diff --git a/lldb/source/API/SBCommandReturnObject.cpp b/lldb/source/API/SBCommandReturnObject.cpp index e78e213aa23af..da7e288e38d28 100644 --- a/lldb/source/API/SBCommandReturnObject.cpp +++ b/lldb/source/API/SBCommandReturnObject.cpp @@ -15,6 +15,7 @@ #include "lldb/API/SBValue.h" #include "lldb/API/SBValueList.h" #include "lldb/Core/StructuredDataImpl.h" +#include "lldb/Host/File.h" #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/Instrumentation.h" @@ -275,14 +276,16 @@ void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh) { void SBCommandReturnObject::SetImmediateOutputFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateOutputFile(file); } void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateErrorFile(file); } diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 5c4c653d95a81..7a4bebfdf998e 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -327,8 +327,8 @@ void SBDebugger::SkipAppInitFiles(bool b) { void SBDebugger::SetInputFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); if (m_opaque_sp) - m_opaque_sp->SetInputFile( - (FileSP)std::make_shared(fh, transfer_ownership)); + m_opaque_sp->SetInputFile((FileSP)std::make_shared( + fh, File::eOpenOptionReadOnly, transfer_ownership)); } SBError SBDebugger::SetInputString(const char *data) { @@ -385,7 +385,8 @@ SBError SBDebugger::SetOutputFile(FileSP file_sp) { void SBDebugger::SetOutputFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetOutputFile((FileSP)std::make_shared(fh, transfer_ownership)); + SetOutputFile((FileSP)std::make_shared( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError SBDebugger::SetOutputFile(SBFile file) { @@ -405,7 +406,8 @@ SBError SBDebugger::SetOutputFile(SBFile file) { void SBDebugger::SetErrorFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetErrorFile((FileSP)std::make_shared(fh, transfer_ownership)); + SetErrorFile((FileSP)std::make_shared( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError 
SBDebugger::SetErrorFile(FileSP file_sp) {
@@ -576,8 +578,10 @@ void SBDebugger::HandleProcessEvent(const SBProcess &process,
                                     FILE *err) {
   LLDB_INSTRUMENT_VA(this, process, event, out, err);

-  FileSP outfile = std::make_shared<NativeFile>(out, false);
-  FileSP errfile = std::make_shared<NativeFile>(err, false);
+  FileSP outfile =
+      std::make_shared<NativeFile>(out, File::eOpenOptionWriteOnly, false);
+  FileSP errfile =
+      std::make_shared<NativeFile>(err, File::eOpenOptionWriteOnly, false);

   return HandleProcessEvent(process, event, outfile, errfile);
 }

diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp
index 2ae4b1481afbf..56909923d4b2d 100644
--- a/lldb/source/API/SBFile.cpp
+++ b/lldb/source/API/SBFile.cpp
@@ -39,7 +39,22 @@ SBFile::SBFile() { LLDB_INSTRUMENT_VA(this); }
 SBFile::SBFile(FILE *file, bool transfer_ownership) {
   LLDB_INSTRUMENT_VA(this, file, transfer_ownership);

-  m_opaque_sp = std::make_shared<NativeFile>(file, transfer_ownership);
+  // For backwards compatibility, this defaulted to ReadOnly previously.
+  m_opaque_sp = std::make_shared<NativeFile>(file, File::eOpenOptionReadOnly,
+                                             transfer_ownership);
+}
+
+SBFile::SBFile(FILE *file, const char *mode, bool transfer_ownership) {
+  LLDB_INSTRUMENT_VA(this, file, mode, transfer_ownership);
+
+  auto options = File::GetOptionsFromMode(mode);
+  if (!options) {
+    llvm::consumeError(options.takeError());
+    return;
+  }
+
+  m_opaque_sp =
+      std::make_shared<NativeFile>(file, options.get(), transfer_ownership);
 }

 SBFile::SBFile(int fd, const char *mode, bool transfer_owndership) {
diff --git a/lldb/source/API/SBInstruction.cpp b/lldb/source/API/SBInstruction.cpp
index 6755089af39a4..5921511f3b239 100644
--- a/lldb/source/API/SBInstruction.cpp
+++ b/lldb/source/API/SBInstruction.cpp
@@ -10,8 +10,8 @@
 #include "lldb/Utility/Instrumentation.h"

 #include "lldb/API/SBAddress.h"
-#include "lldb/API/SBFrame.h"
 #include "lldb/API/SBFile.h"
+#include "lldb/API/SBFrame.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBTarget.h"

@@ -268,7 +268,8 @@ bool SBInstruction::GetDescription(lldb::SBStream &s) {
 void SBInstruction::Print(FILE *outp) {
   LLDB_INSTRUMENT_VA(this, outp);

-  FileSP out = std::make_shared<NativeFile>(outp, /*take_ownership=*/false);
+  FileSP out = std::make_shared<NativeFile>(outp, File::eOpenOptionWriteOnly,
+                                            /*take_ownership=*/false);
   Print(out);
 }

diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp
index d4be64b815369..14aa9432eed83 100644
--- a/lldb/source/API/SBProcess.cpp
+++ b/lldb/source/API/SBProcess.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "lldb/API/SBProcess.h"
+#include "lldb/Host/File.h"
 #include "lldb/Utility/Instrumentation.h"

 #include <inttypes.h>
@@ -310,7 +311,8 @@ void SBProcess::ReportEventState(const SBEvent &event, SBFile out) const {
 void SBProcess::ReportEventState(const SBEvent &event, FILE *out) const {
   LLDB_INSTRUMENT_VA(this, event, out);

-  FileSP outfile = std::make_shared<NativeFile>(out, false);
+  FileSP outfile =
+      std::make_shared<NativeFile>(out, File::eOpenOptionWriteOnly, false);

   return ReportEventState(event, outfile);
 }

diff --git a/lldb/source/API/SBStream.cpp b/lldb/source/API/SBStream.cpp
index fc8f09a7bb9ae..2fc5fcfa8b0c4 100644
--- a/lldb/source/API/SBStream.cpp
+++ b/lldb/source/API/SBStream.cpp
@@ -116,7 +116,8 @@ void SBStream::RedirectToFile(const char *path, bool append) {
 void SBStream::RedirectToFileHandle(FILE *fh, bool transfer_fh_ownership) {
   LLDB_INSTRUMENT_VA(this, fh, transfer_fh_ownership);

-  FileSP file = std::make_unique<NativeFile>(fh, transfer_fh_ownership);
+  FileSP file = std::make_unique<NativeFile>(fh, 
File::eOpenOptionReadWrite, + transfer_fh_ownership); return RedirectToFile(file); } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index b37d9d3ed85e3..02f38e9094ec5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -965,7 +965,8 @@ llvm::StringRef Debugger::GetStaticBroadcasterClass() { Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) : UserID(g_unique_id++), Properties(std::make_shared()), - m_input_file_sp(std::make_shared(stdin, NativeFile::Unowned)), + m_input_file_sp(std::make_shared( + stdin, File::eOpenOptionReadOnly, NativeFile::Unowned)), m_output_stream_sp(std::make_shared( stdout, NativeFile::Unowned, m_output_mutex)), m_error_stream_sp(std::make_shared( @@ -1172,7 +1173,8 @@ Status Debugger::SetInputString(const char *data) { return result; } - SetInputFile((FileSP)std::make_shared(commands_file, true)); + SetInputFile((FileSP)std::make_shared( + commands_file, File::eOpenOptionReadOnly, true)); return result; } @@ -1378,7 +1380,8 @@ void Debugger::AdoptTopIOHandlerFilesIfInvalid(FileSP &in, in = GetInputFileSP(); // If there is nothing, use stdin if (!in) - in = std::make_shared(stdin, NativeFile::Unowned); + in = std::make_shared(stdin, File::eOpenOptionReadOnly, + NativeFile::Unowned); } // If no STDOUT has been set, then set it appropriately if (!out || !out->GetUnlockedFile().IsValid()) { diff --git a/lldb/source/Host/common/File.cpp b/lldb/source/Host/common/File.cpp index 65b75bd647c5d..4fad93fca9ea3 100644 --- a/lldb/source/Host/common/File.cpp +++ b/lldb/source/Host/common/File.cpp @@ -249,8 +249,8 @@ uint32_t File::GetPermissions(Status &error) const { NativeFile::NativeFile() = default; -NativeFile::NativeFile(FILE *fh, bool transfer_ownership) - : m_stream(fh), m_own_stream(transfer_ownership) { +NativeFile::NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership) + : m_stream(fh), m_options(options), m_own_stream(transfer_ownership) { #ifdef _WIN32 // In order to properly display non ASCII characters in Windows, we need to // use Windows APIs to print to the console. This is only required if the @@ -258,6 +258,26 @@ NativeFile::NativeFile(FILE *fh, bool transfer_ownership) int fd = _fileno(fh); is_windows_console = ::GetFileType((HANDLE)::_get_osfhandle(fd)) == FILE_TYPE_CHAR; +#else +#ifndef NDEBUG + int fd = fileno(fh); + if (fd != -1) { + int required_mode = ConvertOpenOptionsForPOSIXOpen(options) & O_ACCMODE; + int mode = fcntl(fd, F_GETFL); + if (mode != -1) { + mode &= O_ACCMODE; + // Check that the file is open with a valid subset of the requested file + // access mode, e.g. if we expected the file to be writable then ensure it + // was opened with O_WRONLY or O_RDWR. 
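+      // (A more permissive actual mode also passes: O_RDWR satisfies a
+      // request for O_RDONLY or O_WRONLY.)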
+ assert( + (required_mode == O_RDWR && mode == O_RDWR) || + (required_mode == O_RDONLY && (mode == O_RDWR || mode == O_RDONLY) || + (required_mode == O_WRONLY && + (mode == O_RDWR || mode == O_WRONLY))) && + "invalid file access mode"); + } + } +#endif #endif } @@ -274,7 +294,8 @@ NativeFile::NativeFile(int fd, OpenOptions options, bool transfer_ownership) } bool NativeFile::IsValid() const { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); return DescriptorIsValidUnlocked() || StreamIsValidUnlocked(); } @@ -343,7 +364,8 @@ FILE *NativeFile::GetStream() { } Status NativeFile::Close() { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); Status error; @@ -548,6 +570,10 @@ Status NativeFile::Sync() { Status NativeFile::Read(void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for reading. + if ((m_options & File::OpenOptionsModeMask) == eOpenOptionWriteOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_READ_SIZE) if (num_bytes > MAX_READ_SIZE) { uint8_t *p = (uint8_t *)buf; @@ -612,6 +638,10 @@ Status NativeFile::Read(void *buf, size_t &num_bytes) { Status NativeFile::Write(const void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for writing. + if ((m_options & File::OpenOptionsModeMask) == File::eOpenOptionReadOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_WRITE_SIZE) if (num_bytes > MAX_WRITE_SIZE) { const uint8_t *p = (const uint8_t *)buf; @@ -776,8 +806,8 @@ Status NativeFile::Write(const void *buf, size_t &num_bytes, off_t &offset) { int fd = GetDescriptor(); if (fd != kInvalidDescriptor) { #ifndef _WIN32 - ssize_t bytes_written = - llvm::sys::RetryAfterSignal(-1, ::pwrite, m_descriptor, buf, num_bytes, offset); + ssize_t bytes_written = llvm::sys::RetryAfterSignal( + -1, ::pwrite, m_descriptor, buf, num_bytes, offset); if (bytes_written < 0) { num_bytes = 0; error = Status::FromErrno(); diff --git a/lldb/source/Host/common/StreamFile.cpp b/lldb/source/Host/common/StreamFile.cpp index 099980a0993c6..131412d81983b 100644 --- a/lldb/source/Host/common/StreamFile.cpp +++ b/lldb/source/Host/common/StreamFile.cpp @@ -27,7 +27,8 @@ StreamFile::StreamFile(int fd, bool transfer_ownership) : Stream() { } StreamFile::StreamFile(FILE *fh, bool transfer_ownership) : Stream() { - m_file_sp = std::make_shared(fh, transfer_ownership); + m_file_sp = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); } StreamFile::StreamFile(const char *path, File::OpenOptions options, diff --git a/lldb/unittests/Host/FileTest.cpp b/lldb/unittests/Host/FileTest.cpp index d973d19430596..85697c49f6fce 100644 --- a/lldb/unittests/Host/FileTest.cpp +++ b/lldb/unittests/Host/FileTest.cpp @@ -8,6 +8,7 @@ #include "lldb/Host/File.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Path.h" @@ -35,7 +36,7 @@ TEST(File, GetWaitableHandleFileno) { FILE *stream = fdopen(fd, "r"); ASSERT_TRUE(stream); - NativeFile file(stream, true); + NativeFile file(stream, File::eOpenOptionReadWrite, true); #ifdef _WIN32 EXPECT_EQ(file.GetWaitableHandle(), (HANDLE)_get_osfhandle(fd)); #else @@ -67,3 +68,22 @@ TEST(File, GetStreamFromDescriptor) { EXPECT_EQ(file.GetWaitableHandle(), (file_t)fd); #endif } + +TEST(File, 
ReadOnlyModeNotWritable) {
+  const auto *Info = testing::UnitTest::GetInstance()->current_test_info();
+  llvm::SmallString<128> name;
+  int fd;
+  llvm::sys::fs::createTemporaryFile(llvm::Twine(Info->test_case_name()) + "-" +
+                                         Info->name(),
+                                     "test", fd, name);
+
+  llvm::FileRemover remover(name);
+  ASSERT_GE(fd, 0);
+
+  NativeFile file(fd, File::eOpenOptionReadOnly, true);
+  ASSERT_TRUE(file.IsValid());
+  llvm::StringLiteral buf = "Hello World";
+  size_t bytes_written = buf.size();
+  Status error = file.Write(buf.data(), bytes_written);
+  EXPECT_EQ(error.Fail(), true);
+}

From aba3269bc98733f56a4b694dddf816c32ff50341 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 17 Nov 2025 10:55:27 -0800
Subject: [PATCH 065/105] [.gitignore] Ignore .claude and .gemini in subdirectories (#167029)

Currently `.claude/` and `.gemini/` are only ignored in the root of the
repo. Developers might conceivably run these tools in project
subdirectories, in which case these should be ignored as well.
---
 .gitignore | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 860b8ea12abd4..a9d616286adf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,9 +54,9 @@ autoconf/autom4te.cache
 /cmake-build*
 # Coding assistants' stuff
 /CLAUDE.md
-/.claude/
+.claude/
 /GEMINI.md
-/.gemini/
+.gemini/

 #==============================================================================#
 # Directories to ignore (do not add trailing '/'s, they skip symlinks).

From adeedad449d19087baa5ec4fbc246d1f6664b7d4 Mon Sep 17 00:00:00 2001
From: Keith Smiley
Date: Mon, 17 Nov 2025 10:56:00 -0800
Subject: [PATCH 066/105] [bazel] Port 900c517919794ff0ea83c6b15ffe03707a164800 (#168423)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index a94442af376e5..4932e674ba85c 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2435,6 +2435,10 @@ llvm_target_lib_list = [lib for lib in [
             ["-gen-searchable-tables"],
             "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc",
         ),
+        (
+            ["-gen-sd-node-info"],
+            "lib/Target/AMDGPU/AMDGPUGenSDNodeInfo.inc",
+        ),
     ],
     "tbl_deps": [
         ":InstCombineTableGen",

From 18b5e2a7266bfe8f211be7ae1198e6bed4ab0c06 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 17 Nov 2025 10:56:28 -0800
Subject: [PATCH 067/105] [lldb] Push down the SWIG module to avoid an import cycle (#166265)

This is a reland of #129135 (by dingxiangfei2009) with Vladislav's
(dzhidzhoev) fix on top.

Fixes #92603
---
 lldb/bindings/python/CMakeLists.txt                      | 8 ++++++--
 lldb/bindings/python/python.swig                         | 7 ++++++-
 .../ScriptInterpreter/Python/ScriptInterpreterPython.cpp | 1 +
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt
index 28a8af8f06319..2ebcf5a8e7aca 100644
--- a/lldb/bindings/python/CMakeLists.txt
+++ b/lldb/bindings/python/CMakeLists.txt
@@ -60,8 +60,10 @@ endfunction()
 function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_target_dir)
   # Add a Post-Build Event to copy over Python files and create the symlink to
   # liblldb.so for the Python API(hardlink on Windows).
+  # Note that Swig-generated code is located one level deeper in the `native`
+  # module, in order to avoid cyclic importing.
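+  # (At runtime the Python package then imports the extension module as
+  # `lldb.native._lldb`.)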
add_custom_target(${swig_target} ALL VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir}/native/ DEPENDS ${lldb_python_bindings_dir}/lldb.py COMMENT "Python script sym-linking LLDB Python API") @@ -75,6 +77,8 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" "${lldb_python_target_dir}") + create_python_package(${swig_target} ${lldb_python_target_dir} "native" FILES) + # Distribute the examples as python packages. create_python_package( ${swig_target} @@ -143,7 +147,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar endif() set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb${LLDB_PYTHON_EXT_SUFFIX}") create_relative_symlink(${swig_target} ${LIBLLDB_SYMLINK_DEST} - ${lldb_python_target_dir} ${LIBLLDB_SYMLINK_OUTPUT_FILE}) + ${lldb_python_target_dir}/native/ ${LIBLLDB_SYMLINK_OUTPUT_FILE}) if (NOT WIN32) diff --git a/lldb/bindings/python/python.swig b/lldb/bindings/python/python.swig index b2823f98acac8..3d2caa65f1658 100644 --- a/lldb/bindings/python/python.swig +++ b/lldb/bindings/python/python.swig @@ -50,7 +50,12 @@ Older swig versions will simply ignore this setting. import $module except ImportError: # Relative import should work if we are being loaded by Python. - from . import $module" + # The cpython module built by swig is pushed one level down into + # the native submodule, because at this point the interpreter + # is still constructing the lldb module itself. + # Simply importing anything using `from . import` constitutes + # a cyclic importing. + from .native import $module" %enddef // The name of the module to be created. diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 3493fa9fef635..35a772c1454df 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -272,6 +272,7 @@ void ScriptInterpreterPython::SharedLibraryDirectoryHelper( // does. if (this_file.GetFileNameExtension() == ".pyd") { this_file.RemoveLastPathComponent(); // _lldb.pyd or _lldb_d.pyd + this_file.RemoveLastPathComponent(); // native this_file.RemoveLastPathComponent(); // lldb llvm::StringRef libdir = LLDB_PYTHON_RELATIVE_LIBDIR; for (auto it = llvm::sys::path::begin(libdir), From 7672a5cee12a299a083b93a6d304b27ab3f4707f Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Mon, 17 Nov 2025 10:59:04 -0800 Subject: [PATCH 068/105] [scudo] Fix wrong return type. (#168157) --- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 747b1a2233d32..c2401c86671d0 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1394,7 +1394,7 @@ uptr SizeClassAllocator64::releaseToOSMaybe(RegionInfo *Region, Region->FreeListInfo.PushedBlocks) * BlockSize; if (UNLIKELY(BytesInFreeList == 0)) - return false; + return 0; // ==================================================================== // // 1. 
Check if we have enough free blocks and if it's worth doing a page From cd5d5b31bff0052be214357133ad3dd7f3f24a74 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 17 Nov 2025 20:00:03 +0100 Subject: [PATCH 069/105] [mlir][XeGPU] Use DistributeLayoutAttr instead of LayoutAttr for load gather/scatter ops (#167850) The PR changes the layout attribute type for `xegpu::LoadGatherOp/StoreScatterOp` from `LayoutAttr` to `DistributeLayoutAttr` to also support `xegpu.slice` layouts. Initially we [wanted to restrict slice layouts](https://github.com/llvm/llvm-project/pull/163414#discussion_r2478978798) from the attribute, but now it turns out there are actually valid use cases for that: ```mlir gpu.func @distribute_load_slice_attr() { %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> %3 = xegpu.load %2[%offset], %mask <{chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]>>} { layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256xf32> to vector<256x256xf32> gpu.return } ``` Signed-off-by: dchigarev --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++++---- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ++-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 ++-- .../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 12 +++++++----- .../Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 17 +++++++++++++++++ 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 689ebd0d1179a..4c67856b559b1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -844,7 +844,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -903,7 +903,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::LayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -988,7 +988,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1046,7 +1046,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::LayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 4dd10bedc6d84..85c9a966f0fe8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -901,7 +901,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, IntegerAttr chunk_size, 
xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint, - xegpu::LayoutAttr layout) { + DistributeLayoutAttr layout) { auto loc = source.getLoc(); int64_t size = static_cast(offsets.size()); auto type = VectorType::get(size, builder.getIndexType()); @@ -985,7 +985,7 @@ void StoreScatterOp::build( OpBuilder &builder, OperationState &state, Value value, Value dest, ArrayRef offsets, Value mask, IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + xegpu::CachePolicyAttr l3_hint, DistributeLayoutAttr layout) { auto loc = dest.getLoc(); int64_t size = static_cast(offsets.size()); auto type = VectorType::get(size, builder.getIndexType()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index c3bf9606693a8..330553564f81a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } - auto layout = dyn_cast_if_present(op.getLayoutAttr()); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets SmallVector convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - auto layout = dyn_cast_if_present(op.getLayoutAttr()); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 0a9ef0aa6df96..33d4b0457e5d3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -889,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef wgShape = resultType.getShape(); - xegpu::LayoutAttr layout = dyn_cast_if_present( - xegpu::getDistributeLayoutAttr(op.getResult())); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -913,10 +913,12 @@ struct WgToSgLoadGatherOpWithOffset VectorType newTy = VectorType::get(sgShape, resultType.getElementType()); for (auto [offsets, mask] : llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { + auto newLayout = layout.dropSgLayoutAndData(); auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), - layout.dropSgLayoutAndData()); + newLayout); + xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -941,8 +943,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::LayoutAttr layout = dyn_cast_if_present( - xegpu::getDistributeLayoutAttr(op.getOperand(0))); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getOperand(0)); if (!layout || !layout.isForWorkgroup()) return failure(); diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 4fbb566cfbe73..5dde84e8e0bc2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ 
-547,4 +547,21 @@ gpu.module @test_distribution { %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> gpu.return } + + // CHECK-LABEL: distribute_load_slice_attr + gpu.func @distribute_load_slice_attr() { + %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> + + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : + // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + + // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> + %4 = vector.broadcast %3 {layout_result_0 = + #xegpu.layout} : vector<256xf32> to vector<256x256xf32> + gpu.return + } } From bafb3f67880d716fcc0ad14f10d8a98699591cd5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 19:01:25 +0000 Subject: [PATCH 070/105] [LV] Add test with existing noalias metadata and runtime checks. Add test where we have loads with existing noalias metadata and noalias metadata gets added by loop versioning. --- .../test/Transforms/LoopVectorize/metadata.ll | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index fe25d1b231efc..ed027e8b9a895 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -497,6 +497,129 @@ exit: ret void } +define void @noalias_metadata(ptr align 8 %dst, ptr align 8 %src) { +; CHECK-LABEL: define void @noalias_metadata( +; CHECK-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr 
[[SRC]], i64 [[TMP23]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[NEXT_GEP]], align 8, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: store ptr [[TMP7]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; CHECK-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; INTERLEAVE-LABEL: define void @noalias_metadata( +; INTERLEAVE-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; INTERLEAVE: [[VECTOR_MEMCHECK]]: +; INTERLEAVE-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; INTERLEAVE-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; INTERLEAVE-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP23]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; 
INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 2 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8, !alias.scope [[META14:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; INTERLEAVE-NEXT: store ptr [[TMP8]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; INTERLEAVE-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; INTERLEAVE-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: ret void +; +entry: + br label %loop + +loop: + %ptr = phi ptr [ %ptr.next, %loop ], [ %src, %entry ] + %val = load ptr, ptr %ptr, align 8 + store ptr %val, ptr %dst, align 8, !noalias !4 + %ptr.next = getelementptr inbounds i8, ptr %ptr, i64 8 + %cmp = icmp eq ptr %ptr, %dst + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + declare i64 @foo(i64) declare double @bar(double) @@ -510,6 +633,9 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ !1 = !{ i64 0, i64 2 } !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"omnipotent char", !2, i64 0} +!4 = !{!5} +!5 = distinct !{!5, !6, !"g1"} +!6 = distinct !{!6, !"t2"} ;. ; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} @@ -526,6 +652,17 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; CHECK: [[META14]] = !{[[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} +; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; CHECK: [[META21]] = distinct !{[[META21]], !"t2"} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; CHECK: [[META23]] = !{[[META20]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. 
; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; INTERLEAVE: [[META1]] = !{!"omnipotent char", [[META2]]} @@ -541,4 +678,15 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; INTERLEAVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; INTERLEAVE: [[META14]] = !{[[META15:![0-9]+]]} +; INTERLEAVE: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; INTERLEAVE: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; INTERLEAVE: [[META17]] = !{[[META18:![0-9]+]]} +; INTERLEAVE: [[META18]] = distinct !{[[META18]], [[META16]]} +; INTERLEAVE: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; INTERLEAVE: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; INTERLEAVE: [[META21]] = distinct !{[[META21]], !"t2"} +; INTERLEAVE: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; INTERLEAVE: [[META23]] = !{[[META20]]} +; INTERLEAVE: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. From af6af8e4eb778acc1b655574cdf2a4086a9fdcce Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 17 Nov 2025 11:08:22 -0800 Subject: [PATCH 071/105] [bazel] Port 0a58e49c44ae7cca39b3eb219efed9f0581b8b0f (#168424) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 4932e674ba85c..d582f448c2213 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3225,6 +3225,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/VE/VEGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/VE/VEGenSDNodeInfo.inc", + ), ], }, { From c555522818ff3acaa928f4147546ecec81e579eb Mon Sep 17 00:00:00 2001 From: John Harrison Date: Mon, 17 Nov 2025 11:08:49 -0800 Subject: [PATCH 072/105] [lldb-dap] Migrating 'evaluate' to structured types. (#167720) Adding structured types for the evaluate request handler. This should be mostly a non-functional change. I did catch some spelling mistakes in our tests ('variable' vs 'variables'). 
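
As a rough, illustrative sketch (not part of the change itself): with the
structured body, a test can check the typed fields directly. `dap_server`
below stands for the usual test-harness connection, and the expected values
follow the updated `TestDAP_evaluate.py`:

```python
# Minimal sketch: evaluate an expression and inspect the structured body.
resp = dap_server.request_evaluate("var1", context="repl")
assert resp["success"]
body = resp["body"]  # shaped like the EvaluateResponseBody TypedDict
assert "20" in body["result"]           # rendered value
assert body["variablesReference"] == 0  # scalars expose no children
```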
--- .../test/tools/lldb-dap/dap_server.py | 3 +- .../lldb-dap/evaluate/TestDAP_evaluate.py | 194 ++++++++++--- .../Handler/EvaluateRequestHandler.cpp | 265 +++++------------- lldb/tools/lldb-dap/Handler/RequestHandler.h | 9 +- lldb/tools/lldb-dap/JSONUtils.cpp | 5 +- lldb/tools/lldb-dap/JSONUtils.h | 6 +- lldb/tools/lldb-dap/LLDBUtils.cpp | 11 +- lldb/tools/lldb-dap/LLDBUtils.h | 2 +- .../lldb-dap/Protocol/ProtocolRequests.cpp | 49 ++++ .../lldb-dap/Protocol/ProtocolRequests.h | 117 ++++++++ lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 3 +- lldb/unittests/DAP/ProtocolRequestsTest.cpp | 51 ++++ 12 files changed, 461 insertions(+), 254 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index ac550962cfb85..a4ca090021f3f 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -978,9 +978,10 @@ def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None return [] args_dict = { "expression": expression, - "context": context, "frameId": stackFrame["id"], } + if context: + args_dict["context"] = context command_dict = { "command": "evaluate", "type": "request", diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py index 20a75f4076e42..3c233a5b43ebb 100644 --- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py +++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py @@ -1,5 +1,5 @@ """ -Test lldb-dap completions request +Test lldb-dap evaluate request """ import re @@ -7,16 +7,67 @@ import lldbdap_testcase from lldbsuite.test.decorators import skipIfWindows from lldbsuite.test.lldbtest import line_number +from typing import TypedDict, Optional + + +class EvaluateResponseBody(TypedDict, total=False): + result: str + variablesReference: int + type: Optional[str] + memoryReference: Optional[str] + valueLocationReference: Optional[int] class TestDAP_evaluate(lldbdap_testcase.DAPTestCaseBase): - def assertEvaluate(self, expression, regex): + def assertEvaluate( + self, + expression, + result: str, + want_type="", + want_varref=False, + want_memref=True, + want_locref=False, + ): + resp = self.dap_server.request_evaluate(expression, context=self.context) + self.assertTrue( + resp["success"], f"Failed to evaluate expression {expression!r}" + ) + body: EvaluateResponseBody = resp["body"] self.assertRegex( - self.dap_server.request_evaluate(expression, context=self.context)["body"][ - "result" - ], - regex, + body["result"], + result, + f"Unexpected 'result' for expression {expression!r} in response body {body}", ) + if want_varref: + self.assertNotEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + else: + self.assertEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + if want_type: + self.assertEqual( + body["type"], + want_type, + f"Unexpected 'type' for expression {expression!r} in response body {body}", + ) + if want_memref: + self.assertIn( + "memoryReference", + body, + f"Unexpected 'memoryReference' for expression {expression!r} in response body {body}", + ) + if want_locref: + self.assertIn( + "valueLocationReference", + body, + f"Unexpected 'valueLocationReference' for expression {expression!r} in response body 
{body}", + ) def assertEvaluateFailure(self, expression): self.assertNotIn( @@ -71,29 +122,39 @@ def run_test_evaluate_expressions( self.continue_to_breakpoint(breakpoint_1) # Expressions at breakpoint 1, which is in main - self.assertEvaluate("var1", "20") + self.assertEvaluate("var1", "20", want_type="int") # Empty expression should equate to the previous expression. if context == "repl": self.assertEvaluate("", "20") else: self.assertEvaluateFailure("") - self.assertEvaluate("var2", "21") + self.assertEvaluate("var2", "21", want_type="int") if context == "repl": - self.assertEvaluate("", "21") - self.assertEvaluate("", "21") - self.assertEvaluate("static_int", "42") - self.assertEvaluate("non_static_int", "43") - self.assertEvaluate("struct1.foo", "15") - self.assertEvaluate("struct2->foo", "16") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("static_int", "42", want_type="int") + self.assertEvaluate("non_static_int", "43", want_type="int") + self.assertEvaluate("struct1.foo", "15", want_type="int") + self.assertEvaluate("struct2->foo", "16", want_type="int") if self.isResultExpandedDescription(): self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, + ) + self.assertEvaluate( + "struct2", + r"\(my_struct \*\) (struct2|\$\d+) = 0x.*", + want_type="my_struct *", + want_varref=True, ) - self.assertEvaluate("struct2", r"\(my_struct \*\) (struct2|\$\d+) = 0x.*") self.assertEvaluate( - "struct3", r"\(my_struct \*\) (struct3|\$\d+) = nullptr" + "struct3", + r"\(my_struct \*\) (struct3|\$\d+) = nullptr", + want_type="my_struct *", + want_varref=True, ) else: self.assertEvaluate( @@ -103,16 +164,22 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_varref=True, + ) + self.assertEvaluate( + "struct2", + "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*", + want_varref=True, + want_type="my_struct *", ) self.assertEvaluate( - "struct2", "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*" + "struct3", "0x.*0", want_varref=True, want_type="my_struct *" ) - self.assertEvaluate("struct3", "0x.*0") if context == "repl": # In the repl context expressions may be interpreted as lldb # commands since no variables have the same name as the command. 
- self.assertEvaluate("list", r".*") + self.assertEvaluate("list", r".*", want_memref=False) else: self.assertEvaluateFailure("list") # local variable of a_function @@ -121,10 +188,26 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("foo") # member of my_struct if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "36") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate( + "a_function(1)", "1", want_memref=False, want_type="int" + ) + self.assertEvaluate("var2 + struct1.foo", "36", want_memref=False) + self.assertEvaluate( + "foo_func", + "0x.*a.out`foo_func.*", + want_type="int (*)()", + want_varref=True, + want_memref=False, + want_locref=True, + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -145,6 +228,8 @@ def run_test_evaluate_expressions( self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, ) else: self.assertEvaluate( @@ -154,15 +239,26 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_type="my_struct", + want_varref=True, ) self.assertEvaluate("struct1.foo", "15") self.assertEvaluate("struct2->foo", "16") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "17") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("var2 + struct1.foo", "17", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -185,10 +281,18 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("var2 + struct1.foo") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("list + 1", "43") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("list + 1", "43", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -199,26 +303,28 @@ def run_test_evaluate_expressions( # Now we check that values are updated after stepping self.continue_to_breakpoint(breakpoint_4) - self.assertEvaluate("my_vec", "size=2") + self.assertEvaluate("my_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_5) - self.assertEvaluate("my_vec", "size=3") + self.assertEvaluate("my_vec", "size=3", want_varref=True) - self.assertEvaluate("my_map", "size=2") + self.assertEvaluate("my_map", "size=2", want_varref=True) 
self.continue_to_breakpoint(breakpoint_6) - self.assertEvaluate("my_map", "size=3") + self.assertEvaluate("my_map", "size=3", want_varref=True) - self.assertEvaluate("my_bool_vec", "size=1") + self.assertEvaluate("my_bool_vec", "size=1", want_varref=True) self.continue_to_breakpoint(breakpoint_7) - self.assertEvaluate("my_bool_vec", "size=2") + self.assertEvaluate("my_bool_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_8) # Test memory read, especially with 'empty' repeat commands. if context == "repl": - self.assertEvaluate("memory read -c 1 &my_ints", ".* 05 .*\n") - self.assertEvaluate("", ".* 0a .*\n") - self.assertEvaluate("", ".* 0f .*\n") - self.assertEvaluate("", ".* 14 .*\n") - self.assertEvaluate("", ".* 19 .*\n") + self.assertEvaluate( + "memory read -c 1 &my_ints", ".* 05 .*\n", want_memref=False + ) + self.assertEvaluate("", ".* 0a .*\n", want_memref=False) + self.assertEvaluate("", ".* 0f .*\n", want_memref=False) + self.assertEvaluate("", ".* 14 .*\n", want_memref=False) + self.assertEvaluate("", ".* 19 .*\n", want_memref=False) self.continue_to_exit() @@ -245,4 +351,6 @@ def test_hover_evaluate_expressions(self): @skipIfWindows def test_variable_evaluate_expressions(self): # Tests expression evaluations that are triggered in the variable explorer - self.run_test_evaluate_expressions("variable", enableAutoVariableSummaries=True) + self.run_test_evaluate_expressions( + "variables", enableAutoVariableSummaries=True + ) diff --git a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp index e1556846dff19..ea8c3a2a4a296 100644 --- a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp @@ -10,148 +10,31 @@ #include "EventHelper.h" #include "JSONUtils.h" #include "LLDBUtils.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" +#include "lldb/lldb-enumerations.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace lldb_dap; +using namespace lldb_dap::protocol; namespace lldb_dap { -// "EvaluateRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Evaluate request; value of command field is 'evaluate'. -// Evaluates the given expression in the context of the -// top most stack frame. The expression has access to any -// variables and arguments that are in scope.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "evaluate" ] -// }, -// "arguments": { -// "$ref": "#/definitions/EvaluateArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "EvaluateArguments": { -// "type": "object", -// "description": "Arguments for 'evaluate' request.", -// "properties": { -// "expression": { -// "type": "string", -// "description": "The expression to evaluate." -// }, -// "frameId": { -// "type": "integer", -// "description": "Evaluate the expression in the scope of this stack -// frame. If not specified, the expression is evaluated -// in the global scope." -// }, -// "context": { -// "type": "string", -// "_enum": [ "watch", "repl", "hover" ], -// "enumDescriptions": [ -// "evaluate is run in a watch.", -// "evaluate is run from REPL console.", -// "evaluate is run from a data hover." -// ], -// "description": "The context in which the evaluate request is run." 
-// }, -// "format": { -// "$ref": "#/definitions/ValueFormat", -// "description": "Specifies details on how to format the Evaluate -// result." -// } -// }, -// "required": [ "expression" ] -// }, -// "EvaluateResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'evaluate' request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "result": { -// "type": "string", -// "description": "The result of the evaluate request." -// }, -// "type": { -// "type": "string", -// "description": "The optional type of the evaluate result." -// }, -// "presentationHint": { -// "$ref": "#/definitions/VariablePresentationHint", -// "description": "Properties of a evaluate result that can be -// used to determine how to render the result in -// the UI." -// }, -// "variablesReference": { -// "type": "number", -// "description": "If variablesReference is > 0, the evaluate -// result is structured and its children can be -// retrieved by passing variablesReference to the -// VariablesRequest." -// }, -// "namedVariables": { -// "type": "number", -// "description": "The number of named child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "indexedVariables": { -// "type": "number", -// "description": "The number of indexed child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "valueLocationReference": { -// "type": "integer", -// "description": "A reference that allows the client to request -// the location where the returned value is -// declared. For example, if a function pointer is -// returned, the adapter may be able to look up the -// function's location. This should be present only -// if the adapter is likely to be able to resolve -// the location.\n\nThis reference shares the same -// lifetime as the `variablesReference`. See -// 'Lifetime of Object References' in the -// Overview section for details." -// } -// "memoryReference": { -// "type": "string", -// "description": "A memory reference to a location appropriate -// for this result. For pointer type eval -// results, this is generally a reference to the -// memory address contained in the pointer. This -// attribute may be returned by a debug adapter -// if corresponding capability -// `supportsMemoryReferences` is true." -// }, -// }, -// "required": [ "result", "variablesReference" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void EvaluateRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - llvm::json::Object body; - const auto *arguments = request.getObject("arguments"); - lldb::SBFrame frame = dap.GetLLDBFrame(*arguments); - std::string expression = - GetString(arguments, "expression").value_or("").str(); - const llvm::StringRef context = GetString(arguments, "context").value_or(""); +/// Evaluates the given expression in the context of a stack frame. +/// +/// The expression has access to any variables and arguments that are in scope. 
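+///
+/// In the REPL context, an empty expression re-evaluates the previous
+/// command or variable expression.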
+Expected +EvaluateRequestHandler::Run(const EvaluateArguments &arguments) const { + EvaluateResponseBody body; + lldb::SBFrame frame = dap.GetLLDBFrame(arguments.frameId); + std::string expression = arguments.expression; bool repeat_last_command = expression.empty() && dap.last_nonempty_var_expression.empty(); - if (context == "repl" && + if (arguments.context == protocol::eEvaluateContextRepl && (repeat_last_command || (!expression.empty() && dap.DetectReplMode(frame, expression, false) == ReplMode::Command))) { @@ -165,70 +48,60 @@ void EvaluateRequestHandler::operator()( } bool required_command_failed = false; - std::string result = RunLLDBCommands( + body.result = RunLLDBCommands( dap.debugger, llvm::StringRef(), {expression}, required_command_failed, /*parse_command_directives=*/false, /*echo_commands=*/false); + return body; + } - EmplaceSafeString(body, "result", result); - body.try_emplace("variablesReference", (int64_t)0); - } else { - if (context == "repl") { - // If the expression is empty and the last expression was for a - // variable, set the expression to the previous expression (repeat the - // evaluation); otherwise save the current non-empty expression for the - // next (possibly empty) variable expression. - if (expression.empty()) - expression = dap.last_nonempty_var_expression; - else - dap.last_nonempty_var_expression = expression; - } - // Always try to get the answer from the local variables if possible. If - // this fails, then if the context is not "hover", actually evaluate an - // expression using the expression parser. - // - // "frame variable" is more reliable than the expression parser in - // many cases and it is faster. - lldb::SBValue value = frame.GetValueForVariablePath( - expression.data(), lldb::eDynamicDontRunTarget); - - // Freeze dry the value in case users expand it later in the debug console - if (value.GetError().Success() && context == "repl") - value = value.Persist(); - - if (value.GetError().Fail() && context != "hover") - value = frame.EvaluateExpression(expression.data()); - - if (value.GetError().Fail()) { - response["success"] = llvm::json::Value(false); - // This error object must live until we're done with the pointer returned - // by GetCString(). 
- lldb::SBError error = value.GetError(); - const char *error_cstr = error.GetCString(); - if (error_cstr && error_cstr[0]) - EmplaceSafeString(response, "message", error_cstr); - else - EmplaceSafeString(response, "message", "evaluate failed"); - } else { - VariableDescription desc(value, - dap.configuration.enableAutoVariableSummaries); - EmplaceSafeString(body, "result", desc.GetResult(context)); - EmplaceSafeString(body, "type", desc.display_type_name); - int64_t var_ref = 0; - if (value.MightHaveChildren() || ValuePointsToCode(value)) - var_ref = dap.variables.InsertVariable( - value, /*is_permanent=*/context == "repl"); - if (value.MightHaveChildren()) - body.try_emplace("variablesReference", var_ref); - else - body.try_emplace("variablesReference", (int64_t)0); - if (lldb::addr_t addr = value.GetLoadAddress(); - addr != LLDB_INVALID_ADDRESS) - body.try_emplace("memoryReference", EncodeMemoryReference(addr)); - if (ValuePointsToCode(value)) - body.try_emplace("valueLocationReference", var_ref); - } + if (arguments.context == eEvaluateContextRepl) { + // If the expression is empty and the last expression was for a + // variable, set the expression to the previous expression (repeat the + // evaluation); otherwise save the current non-empty expression for the + // next (possibly empty) variable expression. + if (expression.empty()) + expression = dap.last_nonempty_var_expression; + else + dap.last_nonempty_var_expression = expression; } - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + + // Always try to get the answer from the local variables if possible. If + // this fails, then if the context is not "hover", actually evaluate an + // expression using the expression parser. + // + // "frame variable" is more reliable than the expression parser in + // many cases and it is faster. 
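+  //
+  // Note that in the "hover" context we only consult the frame's variables
+  // and never fall back to the expression parser.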
+  lldb::SBValue value = frame.GetValueForVariablePath(
+      expression.data(), lldb::eDynamicDontRunTarget);
+
+  // Freeze dry the value in case users expand it later in the debug console
+  if (value.GetError().Success() && arguments.context == eEvaluateContextRepl)
+    value = value.Persist();
+
+  if (value.GetError().Fail() && arguments.context != eEvaluateContextHover)
+    value = frame.EvaluateExpression(expression.data());
+
+  if (value.GetError().Fail())
+    return ToError(value.GetError(), /*show_user=*/false);
+
+  VariableDescription desc(value,
+                           dap.configuration.enableAutoVariableSummaries);
+
+  body.result = desc.GetResult(arguments.context);
+  body.type = desc.display_type_name;
+
+  if (value.MightHaveChildren() || ValuePointsToCode(value))
+    body.variablesReference = dap.variables.InsertVariable(
+        value, /*is_permanent=*/arguments.context == eEvaluateContextRepl);
+
+  if (lldb::addr_t addr = value.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS)
+    body.memoryReference = EncodeMemoryReference(addr);
+
+  if (ValuePointsToCode(value) &&
+      body.variablesReference != LLDB_DAP_INVALID_VARRERF)
+    body.valueLocationReference = PackLocation(body.variablesReference, true);
+
+  return body;
 }
+
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index bc22133d92453..65a52075ebd79 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -292,11 +292,14 @@ class DisconnectRequestHandler
   Run(const std::optional<protocol::DisconnectArguments> &args) const override;
 };
 
-class EvaluateRequestHandler : public LegacyRequestHandler {
+class EvaluateRequestHandler
+    : public RequestHandler<protocol::EvaluateArguments,
+                            llvm::Expected<protocol::EvaluateResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "evaluate"; }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::EvaluateResponseBody>
+  Run(const protocol::EvaluateArguments &) const override;
   FeatureSet GetSupportedFeatures() const override {
     return {protocol::eAdapterFeatureEvaluateForHovers};
   }
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 1a3a6701b194d..81eadae03bb48 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -11,6 +11,7 @@
 #include "ExceptionBreakpoint.h"
 #include "LLDBUtils.h"
 #include "Protocol/ProtocolBase.h"
+#include "Protocol/ProtocolRequests.h"
 #include "ProtocolUtils.h"
 #include "lldb/API/SBAddress.h"
 #include "lldb/API/SBCompileUnit.h"
@@ -817,10 +818,10 @@ VariableDescription::VariableDescription(lldb::SBValue v,
   evaluate_name = llvm::StringRef(evaluateStream.GetData()).str();
 }
 
-std::string VariableDescription::GetResult(llvm::StringRef context) {
+std::string VariableDescription::GetResult(protocol::EvaluateContext context) {
   // In repl context, the results can be displayed as multiple lines so more
   // detailed descriptions can be returned.
-  if (context != "repl")
+  if (context != protocol::eEvaluateContextRepl)
     return display_value;
 
   if (!v.IsValid())
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 0c865a33a6ce4..329dc8ab02f99 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -10,7 +10,7 @@
 #define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H
 
 #include "DAPForward.h"
-#include "Protocol/ProtocolTypes.h"
+#include "Protocol/ProtocolRequests.h"
 #include "lldb/API/SBCompileUnit.h"
 #include "lldb/API/SBFormat.h"
 #include "lldb/API/SBType.h"
@@ -28,7 +28,7 @@
 
 namespace lldb_dap {
 
-/// Emplace a StringRef in a json::Object after enusring that the
+/// Emplace a StringRef in a json::Object after ensuring that the
 /// string is valid UTF8. If not, first call llvm::json::fixUTF8
 /// before emplacing.
 ///
@@ -351,7 +351,7 @@ struct VariableDescription {
                       std::optional<std::string> custom_name = {});
 
   /// Returns a description of the value appropriate for the specified context.
-  std::string GetResult(llvm::StringRef context);
+  std::string GetResult(protocol::EvaluateContext context);
 };
 
 /// Does the given variable have an associated value location?
diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp
index 4db6caa1af38b..e2ba2ee64103d 100644
--- a/lldb/tools/lldb-dap/LLDBUtils.cpp
+++ b/lldb/tools/lldb-dap/LLDBUtils.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LLDBUtils.h"
+#include "DAPError.h"
 #include "JSONUtils.h"
 #include "lldb/API/SBCommandInterpreter.h"
 #include "lldb/API/SBCommandReturnObject.h"
@@ -17,6 +18,7 @@
 #include "lldb/API/SBThread.h"
 #include "lldb/lldb-enumerations.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -214,13 +216,14 @@ GetStopDisassemblyDisplay(lldb::SBDebugger &debugger) {
   return result;
 }
 
-llvm::Error ToError(const lldb::SBError &error) {
+llvm::Error ToError(const lldb::SBError &error, bool show_user) {
   if (error.Success())
     return llvm::Error::success();
-  return llvm::createStringError(
-      std::error_code(error.GetError(), std::generic_category()),
-      error.GetCString());
+  return llvm::make_error<DAPError>(
+      /*message=*/error.GetCString(),
+      /*EC=*/std::error_code(error.GetError(), std::generic_category()),
+      /*show_user=*/show_user);
 }
 
 std::string GetStringValue(const lldb::SBStructuredData &data) {
diff --git a/lldb/tools/lldb-dap/LLDBUtils.h b/lldb/tools/lldb-dap/LLDBUtils.h
index 9db721a47ccf7..a29d3d88789a0 100644
--- a/lldb/tools/lldb-dap/LLDBUtils.h
+++ b/lldb/tools/lldb-dap/LLDBUtils.h
@@ -243,7 +243,7 @@ class ScopeSyncMode {
 lldb::StopDisassemblyType GetStopDisassemblyDisplay(lldb::SBDebugger &debugger);
 
 /// Take ownership of the stored error.
-llvm::Error ToError(const lldb::SBError &error);
+llvm::Error ToError(const lldb::SBError &error, bool show_user = true);
 
 /// Provides the string value if this data structure is a string type.
 std::string GetStringValue(const lldb::SBStructuredData &data);
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index 44ae79f8b9f43..ac01cfb95dd41 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -8,6 +8,7 @@
 
 #include "Protocol/ProtocolRequests.h"
 #include "JSONUtils.h"
+#include "Protocol/ProtocolTypes.h"
 #include "lldb/lldb-defines.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
@@ -639,6 +640,54 @@ json::Value toJSON(const ExceptionInfoResponseBody &ERB) {
     result.insert({"description", ERB.description});
   if (ERB.details.has_value())
     result.insert({"details", *ERB.details});
+  return result;
+}
+
+static bool fromJSON(const llvm::json::Value &Params, EvaluateContext &C,
+                     llvm::json::Path P) {
+  auto rawContext = Params.getAsString();
+  if (!rawContext) {
+    P.report("expected a string");
+    return false;
+  }
+  C = StringSwitch<EvaluateContext>(*rawContext)
+          .Case("watch", EvaluateContext::eEvaluateContextWatch)
+          .Case("repl", EvaluateContext::eEvaluateContextRepl)
+          .Case("hover", EvaluateContext::eEvaluateContextHover)
+          .Case("clipboard", EvaluateContext::eEvaluateContextClipboard)
+          .Case("variables", EvaluateContext::eEvaluateContextVariables)
+          .Default(eEvaluateContextUnknown);
+  return true;
+}
+
+bool fromJSON(const llvm::json::Value &Params, EvaluateArguments &Args,
+              llvm::json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("expression", Args.expression) &&
+         O.mapOptional("frameId", Args.frameId) &&
+         O.mapOptional("line", Args.line) &&
+         O.mapOptional("column", Args.column) &&
+         O.mapOptional("source", Args.source) &&
+         O.mapOptional("context", Args.context) &&
+         O.mapOptional("format", Args.format);
+}
+
+llvm::json::Value toJSON(const EvaluateResponseBody &Body) {
+  json::Object result{{"result", Body.result},
+                      {"variablesReference", Body.variablesReference}};
+
+  if (!Body.type.empty())
+    result.insert({"type", Body.type});
+  if (Body.presentationHint)
+    result.insert({"presentationHint", Body.presentationHint});
+  if (Body.namedVariables)
+    result.insert({"namedVariables", Body.namedVariables});
+  if (Body.indexedVariables)
+    result.insert({"indexedVariables", Body.indexedVariables});
+  if (!Body.memoryReference.empty())
+    result.insert({"memoryReference", Body.memoryReference});
+  if (Body.valueLocationReference != LLDB_DAP_INVALID_VALUE_LOC)
+    result.insert({"valueLocationReference", Body.valueLocationReference});
   return result;
 }
 
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index b894f2b4ed44d..c1e1e93f1e44a 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -1061,6 +1061,123 @@ struct ExceptionInfoResponseBody {
 };
 llvm::json::Value toJSON(const ExceptionInfoResponseBody &);
 
+/// The context in which the evaluate request is used.
+enum EvaluateContext : unsigned {
+  /// An unspecified or unknown evaluate context.
+  eEvaluateContextUnknown = 0,
+  /// 'watch': evaluate is called from a watch view context.
+  eEvaluateContextWatch = 1,
+  /// 'repl': evaluate is called from a REPL context.
+  eEvaluateContextRepl = 2,
+  /// 'hover': evaluate is called to generate the debug hover contents.
+  /// This value should only be used if the corresponding capability
+  /// `supportsEvaluateForHovers` is true.
+  eEvaluateContextHover = 3,
+  /// 'clipboard': evaluate is called to generate clipboard contents.
+  /// This value should only be used if the corresponding capability
+  /// `supportsClipboardContext` is true.
+  eEvaluateContextClipboard = 4,
+  /// 'variables': evaluate is called from a variables view context.
+  eEvaluateContextVariables = 5,
+};
+
+/// Arguments for `evaluate` request.
+struct EvaluateArguments {
+  /// The expression to evaluate.
+  std::string expression;
+
+  /// Evaluate the expression in the scope of this stack frame. If not
+  /// specified, the expression is evaluated in the global scope.
+  uint64_t frameId = LLDB_DAP_INVALID_FRAME_ID;
+
+  /// The contextual line where the expression should be evaluated. In the
+  /// 'hover' context, this should be set to the start of the expression being
+  /// hovered.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// The contextual column where the expression should be evaluated. This may
+  /// be provided if `line` is also provided.
+  ///
+  /// It is measured in UTF-16 code units and the client capability
+  /// `columnsStartAt1` determines whether it is 0- or 1-based.
+  uint32_t column = LLDB_INVALID_COLUMN_NUMBER;
+
+  /// The contextual source in which the `line` is found. This must be provided
+  /// if `line` is provided.
+  std::optional<Source> source;
+
+  /// The context in which the evaluate request is used.
+  /// Values:
+  /// 'watch': evaluate is called from a watch view context.
+  /// 'repl': evaluate is called from a REPL context.
+  /// 'hover': evaluate is called to generate the debug hover contents.
+  /// This value should only be used if the corresponding capability
+  /// `supportsEvaluateForHovers` is true.
+  /// 'clipboard': evaluate is called to generate clipboard contents.
+  /// This value should only be used if the corresponding capability
+  /// `supportsClipboardContext` is true.
+  /// 'variables': evaluate is called from a variables view context.
+  /// etc.
+  EvaluateContext context = eEvaluateContextUnknown;
+
+  /// Specifies details on how to format the result.
+  /// The attribute is only honored by a debug adapter if the corresponding
+  /// capability `supportsValueFormattingOptions` is true.
+  std::optional<ValueFormat> format;
+};
+bool fromJSON(const llvm::json::Value &, EvaluateArguments &, llvm::json::Path);
+
+/// Response to 'evaluate' request.
+struct EvaluateResponseBody {
+  /// The result of the evaluate request.
+  std::string result;
+
+  /// The type of the evaluate result.
+  /// This attribute should only be returned by a debug adapter if the
+  /// corresponding capability `supportsVariableType` is true.
+  std::string type;
+
+  /// Properties of an evaluate result that can be used to determine how to
+  /// render the result in the UI.
+  std::optional<VariablePresentationHint> presentationHint;
+
+  /// If `variablesReference` is > 0, the evaluate result is structured and its
+  /// children can be retrieved by passing `variablesReference` to the
+  /// `variables` request as long as execution remains suspended. See 'Lifetime
+  /// of Object References' in the Overview section for details.
+  int64_t variablesReference = 0;
+
+  /// The number of named child variables.
+  /// The client can use this information to present the variables in a paged
+  /// UI and fetch them in chunks.
+  /// The value should be less than or equal to 2147483647 (2^31-1).
+  uint32_t namedVariables = 0;
+
+  /// The number of indexed child variables.
+  /// The client can use this information to present the variables in a paged
+  /// UI and fetch them in chunks.
+  /// The value should be less than or equal to 2147483647 (2^31-1).
+  uint32_t indexedVariables = 0;
+
+  /// A memory reference to a location appropriate for this result.
+  /// For pointer type eval results, this is generally a reference to the
+  /// memory address contained in the pointer.
+  /// This attribute may be returned by a debug adapter if corresponding
+  /// capability `supportsMemoryReferences` is true.
+  std::string memoryReference;
+
+  /// A reference that allows the client to request the location where the
+  /// returned value is declared. For example, if a function pointer is
+  /// returned, the adapter may be able to look up the function's location.
+  /// This should be present only if the adapter is likely to be able to
+  /// resolve the location.
+  ///
+  /// This reference shares the same lifetime as the `variablesReference`. See
+  /// 'Lifetime of Object References' in the Overview section for details.
+  uint64_t valueLocationReference = LLDB_DAP_INVALID_VALUE_LOC;
+};
+llvm::json::Value toJSON(const EvaluateResponseBody &);
+
 } // namespace lldb_dap::protocol
 
 #endif
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index 6d85c74377bd3..690a1d684d0e9 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -28,8 +28,9 @@
 #include <optional>
 #include <string>
 
-#define LLDB_DAP_INVALID_VARRERF UINT64_MAX
+#define LLDB_DAP_INVALID_VARRERF INT64_MAX
 #define LLDB_DAP_INVALID_SRC_REF 0
+#define LLDB_DAP_INVALID_VALUE_LOC 0
 
 namespace lldb_dap::protocol {
diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
index 498195dc09325..ba9aef1e5fcc5 100644
--- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp
+++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp
@@ -67,3 +67,54 @@ TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) {
   ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded());
   EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body));
 }
+
+TEST(ProtocolRequestsTest, EvaluateArguments) {
+  llvm::Expected<EvaluateArguments> expected = parse<EvaluateArguments>(R"({
+    "expression": "hello world",
+    "context": "repl"
+  })");
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(expected->expression, "hello world");
+  EXPECT_EQ(expected->context, eEvaluateContextRepl);
+
+  // Check required keys.
+  EXPECT_THAT_EXPECTED(parse<EvaluateArguments>(R"({})"),
+                       FailedWithMessage("missing value at (root).expression"));
+}
+
+TEST(ProtocolRequestsTest, EvaluateResponseBody) {
+  EvaluateResponseBody body;
+  body.result = "hello world";
+  body.variablesReference = 7;
+
+  // Check required keys.
+  Expected<json::Value> expected = parse(R"({
+    "result": "hello world",
+    "variablesReference": 7
+  })");
+
+  ASSERT_THAT_EXPECTED(expected, llvm::Succeeded());
+  EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body));
+
+  // Check optional keys.
+  body.result = "'abc'";
+  body.type = "string";
+  body.variablesReference = 42;
+  body.namedVariables = 1;
+  body.indexedVariables = 2;
+  body.memoryReference = "0x123";
+  body.valueLocationReference = 22;
+
+  Expected<json::Value> expected_opt = parse(R"({
+    "result": "'abc'",
+    "type": "string",
+    "variablesReference": 42,
+    "namedVariables": 1,
+    "indexedVariables": 2,
+    "memoryReference": "0x123",
+    "valueLocationReference": 22
+  })");
+
+  ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded());
+  EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body));
+}

From c1c22cd3e16beb3937eb0d11da014451397be5d6 Mon Sep 17 00:00:00 2001
From: Amit Kumar Pandey
Date: Tue, 18 Nov 2025 00:43:09 +0530
Subject: [PATCH 073/105] [ASan][HIP] Add ASan declarations and macros.
 (#167522)

This patch adds the following device ASan hooks and guarded macros in
__clang_hip_libdevice_declares.h
- Function Declarations
  - __asan_poison_memory_region
  - __asan_unpoison_memory_region
  - __asan_address_is_poisoned
  - __asan_region_is_poisoned
- Macros
  - ASAN_POISON_MEMORY_REGION
  - ASAN_UNPOISON_MEMORY_REGION
---
 .../Headers/__clang_hip_libdevice_declares.h  | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h
index fa8d918248dd0..fad9c6ca7ffc5 100644
--- a/clang/lib/Headers/__clang_hip_libdevice_declares.h
+++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -338,6 +338,23 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
 
+__device__ void __asan_poison_memory_region(const void *addr,
+                                            __SIZE_TYPE__ size);
+__device__ void __asan_unpoison_memory_region(const void *addr,
+                                              __SIZE_TYPE__ size);
+__device__ int __asan_address_is_poisoned(const void *addr);
+__device__ void *__asan_region_is_poisoned(void *beg, __SIZE_TYPE__ size);
+
+#if __has_feature(address_sanitizer)
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
 #ifdef __cplusplus
 } // extern "C"
 #endif

From 69b4190d5f1b483524f2f539f373960ef8de8d84 Mon Sep 17 00:00:00 2001
From: Guy David
Date: Mon, 17 Nov 2025 21:39:10 +0200
Subject: [PATCH 074/105] [AArch64] Optimize extending loads of small vectors
 (#163064)

Reduces the total number of loads and the number of moves between SIMD
registers and general-purpose registers.
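
For example, loading a <2 x i16> vector (test1 in the updated
aarch64-load-ext.ll below) previously required two lane loads and an
address increment:

  ld1 { v0.h }[0], [x0]
  add x8, x0, #2
  ld1 { v0.h }[2], [x8]

With this change it becomes a single scalar FP load plus one
extend-in-register:

  ldr s0, [x0]
  ushll v0.4s, v0.4h, #0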
--- .../Target/AArch64/AArch64ISelLowering.cpp | 148 +++++++--- llvm/test/CodeGen/AArch64/aarch64-load-ext.ll | 264 ++++++++++++++++-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 12 +- llvm/test/CodeGen/AArch64/add.ll | 27 +- llvm/test/CodeGen/AArch64/andorxor.ll | 81 +++--- llvm/test/CodeGen/AArch64/bitcast.ll | 6 +- llvm/test/CodeGen/AArch64/ctlz.ll | 18 +- llvm/test/CodeGen/AArch64/ctpop.ll | 18 +- llvm/test/CodeGen/AArch64/cttz.ll | 16 +- llvm/test/CodeGen/AArch64/extbinopload.ll | 26 +- llvm/test/CodeGen/AArch64/load.ll | 11 +- llvm/test/CodeGen/AArch64/mul.ll | 27 +- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 22 +- llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll | 8 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 22 +- llvm/test/CodeGen/AArch64/store.ll | 7 +- llvm/test/CodeGen/AArch64/sub.ll | 27 +- .../AArch64/sve-fixed-length-ext-loads.ll | 8 +- .../AArch64/sve-fixed-length-masked-gather.ll | 13 +- .../sve-fixed-length-masked-scatter.ll | 13 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 26 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 26 +- llvm/test/CodeGen/AArch64/v3f-to-int.ll | 15 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 7 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 67 +++-- 25 files changed, 567 insertions(+), 348 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 35836af3c874b..42567883b2594 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1427,12 +1427,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -6728,8 +6740,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; } +/// Helper function to check if a small vector load can be optimized. 
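+/// A load is eligible when NEON is available, the load is not volatile, its
+/// memory type is one of v2i8, v4i8 or v2i16, and, in strict-alignment mode,
+/// the access is at least naturally aligned.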
+static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
+                                            const AArch64Subtarget &Subtarget) {
+  if (!Subtarget.isNeonAvailable())
+    return false;
+  if (LD->isVolatile())
+    return false;
+
+  EVT MemVT = LD->getMemoryVT();
+  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
+    return false;
+
+  Align Alignment = LD->getAlign();
+  Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
+  if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
+    return false;
+
+  return true;
+}
+
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   EVT ExtVT = ExtVal.getValueType();
+  // Small, illegal vectors can be extended inreg.
+  if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
+    if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
+        isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
+      return true;
+  }
   if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
     return false;
@@ -7188,12 +7226,86 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
   return Result;
 }
 
+/// Helper function to optimize loads of extended small vectors.
+/// These patterns would otherwise get scalarized into inefficient sequences.
+static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load,
+                                          SelectionDAG &DAG) {
+  const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
+    return SDValue();
+
+  EVT MemVT = Load->getMemoryVT();
+  EVT ResVT = Load->getValueType(0);
+  unsigned NumElts = ResVT.getVectorNumElements();
+  unsigned DstEltBits = ResVT.getScalarSizeInBits();
+  unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+
+  unsigned ExtOpcode;
+  switch (Load->getExtensionType()) {
+  case ISD::EXTLOAD:
+  case ISD::ZEXTLOAD:
+    ExtOpcode = ISD::ZERO_EXTEND;
+    break;
+  case ISD::SEXTLOAD:
+    ExtOpcode = ISD::SIGN_EXTEND;
+    break;
+  case ISD::NON_EXTLOAD:
+    return SDValue();
+  }
+
+  SDLoc DL(Load);
+  SDValue Chain = Load->getChain();
+  SDValue BasePtr = Load->getBasePtr();
+  const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
+  Align Alignment = Load->getAlign();
+
+  // Load the data as an FP scalar to avoid issues with integer loads.
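+  // An FP/SIMD scalar load places the bits directly in a vector register,
+  // avoiding a GPR load followed by a cross-register-bank move.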
+  unsigned LoadBits = MemVT.getStoreSizeInBits();
+  MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
+  SDValue ScalarLoad =
+      DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
+
+  MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
+  SDValue ScalarToVec =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
+  MVT BitcastTy =
+      MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
+
+  SDValue Res = Bitcast;
+  unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
+  unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
+  while (CurrentEltBits < DstEltBits) {
+    if (Res.getValueSizeInBits() >= 128) {
+      CurrentNumElts = CurrentNumElts / 2;
+      MVT ExtractVT =
+          MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
+                        DAG.getConstant(0, DL, MVT::i64));
+    }
+    CurrentEltBits = CurrentEltBits * 2;
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
+    Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
+  }
+
+  if (CurrentNumElts != NumElts) {
+    MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
+                      DAG.getConstant(0, DL, MVT::i64));
+  }
+
+  return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
+}
+
 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                          SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   assert(LoadNode && "Expected custom lowering of a load node");
+  if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
+    return Result;
+
   if (LoadNode->getMemoryVT() == MVT::i64x8) {
     SmallVector<SDValue, 8> Ops;
     SDValue Base = LoadNode->getBasePtr();
@@ -7212,37 +7324,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
     return DAG.getMergeValues({Loaded, Chain}, DL);
   }
 
-  // Custom lowering for extending v4i8 vector loads.
-  EVT VT = Op->getValueType(0);
-  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
-
-  if (LoadNode->getMemoryVT() != MVT::v4i8)
-    return SDValue();
-
-  // Avoid generating unaligned loads.
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) - return SDValue(); - - unsigned ExtType; - if (LoadNode->getExtensionType() == ISD::SEXTLOAD) - ExtType = ISD::SIGN_EXTEND; - else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || - LoadNode->getExtensionType() == ISD::EXTLOAD) - ExtType = ISD::ZERO_EXTEND; - else - return SDValue(); - - SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), MachinePointerInfo()); - SDValue Chain = Load.getValue(1); - SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); - SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); - SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); - Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, - DAG.getConstant(0, DL, MVT::i64)); - if (VT == MVT::v4i32) - Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); - return DAG.getMergeValues({Ext, Chain}, DL); + return SDValue(); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll index 317feb5ad9ad0..0ef2b31d00daa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) { define <2 x i16> @test1(ptr %v2i16_ptr) { ; CHECK-LE-LABEL: test1: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #2 -; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test1: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: 
fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,213 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: 
// kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @zext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: zext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = zext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @sext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: sext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; 
CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = sext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index e85e808921c87..a302ddf483caa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -219,21 +219,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0, #2] -; CHECK-NEON-NEXT: ldr h0, [x0] +; CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: mov v0.d[1], x8 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0, #2] -; CHECK-SVE-NEXT: ldr h0, [x0] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: mov v0.d[1], x8 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 96168cb80196f..7502db4c5aa93 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git 
a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index a7875dbebd0e6..d8d003c85eed6 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -176,12 +176,12 @@ entry: define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -212,12 +212,12 @@ entry: define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -248,12 +248,12 @@ entry: define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, 
[x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -698,12 +695,10 @@ entry: define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -734,12 +729,10 @@ entry: define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -770,12 +763,10 @@ entry: define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 20f19fddf790a..002e6cd509bec 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: add x8, sp, #12 ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 04124609eec74..b1b869ec9e1ff 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -6,11 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] +; CHECK-SD-NEXT: ldr h1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #24 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] @@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: 
str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -145,11 +143,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] +; CHECK-SD-NEXT: ldr s1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #16 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index d547b6bec5b83..9c59f1b233b5d 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -6,10 +6,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h @@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -143,10 +141,8 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index fc9bf2c0aca65..c9181b4c312d1 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -6,10 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ldr h0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b @@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -219,10 +218,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: 
ldr s0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e40..d18cff51c6101 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x0, #4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c4bb6e37d6eaf..b138fa4085427 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9c69a6f03b858..475bd22c6ebcb 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -68,13 +68,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { 
v1.b }[4], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -240,13 +237,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 1c4a504d0ab70..b31a5ea0b5d79 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll index 3e708b0678fbc..297b25ed075e4 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB3_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #1 -; CHECK-NEXT: ldrsb w10, [x9] -; 
CHECK-NEXT: ldrsb w9, [x9, #1] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, x8, lsl #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: str q0, [x1, x8, lsl #4] diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 3af858713525b..02eb40b412efd 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 3a9f12b838702..1dc55fccc3dac 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: strb w2, [x3, #2] ; CHECK-SD-NEXT: mov v0.h[1], w1 ; CHECK-SD-NEXT: mov v0.h[2], w2 ; CHECK-SD-NEXT: xtn v0.8b, v0.8h -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] -; CHECK-SD-NEXT: strb w2, [x3, #2] -; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: str h0, [x3] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 5e278d59b6591..dd920b98e18eb 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; 
CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index ba7bee9a94bac..a77c74ab67b80 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -7,8 +7,10 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_zext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -97,8 +99,10 @@ define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 { define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_sext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = sext <4 x i16> %a to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 6fd5b820a2242..b457e0307fbe1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 @@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 
-; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..4fb3bf7392d4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 @@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 3cfb24aaccb11..cd02d18e61643 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a71cf95a728db..ef70137e6deee 100644 
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll index f6553b6acec9d..6d4061fb02cff 100644 --- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll +++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s -; CHECK-LABEL: convert_v3f32 -; CHECK: strb -; CHECK: strh define void @convert_v3f32() { +; CHECK-LABEL: convert_v3f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: strb wzr, [x8] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: br label %bb diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 80029fb717575..ee74984125f77 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-SD-NEXT: shl.16b v0, v0, #7 ; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE ; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; CHECK-SD-NEXT: add x8, sp, #14 ; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 ; CHECK-SD-NEXT: and.16b v0, v0, v1 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 ; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: addv.8h h0, v0 -; CHECK-SD-NEXT: str h0, [sp, #14] -; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x1 -; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ushll.8h v0, v0, #0 +; CHECK-SD-NEXT: ushll.4s v0, v0, #0 ; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll 
b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 7d3f5bc270d6b..60414adba75fc 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldr s0, [x0] ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: strb w8, [x1, #2] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: str s0, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: str h0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #3] -; BE-NEXT: sturh w8, [x1, #1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #5] -; BE-NEXT: sturh w8, [x1, #3] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -764,10 +764,9 @@ define 
void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: mov h0, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: ushll.4s v1, v1, #0 ; CHECK-NEXT: stur b0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] +; CHECK-NEXT: str h1, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 From 21e0b56d7afc2f1af0ad5b728fcc039bfe1d37ff Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 17 Nov 2025 19:47:36 +0000 Subject: [PATCH 075/105] [AArch64][GlobalISel] Add basic GISel test coverage for lround and llround. 
NFC --- .../test/CodeGen/AArch64/llround-conv-fp16.ll | 8 ++- llvm/test/CodeGen/AArch64/llround-conv.ll | 67 ++++++++++++------- llvm/test/CodeGen/AArch64/lround-conv-fp16.ll | 8 ++- llvm/test/CodeGen/AArch64/lround-conv.ll | 55 +++++++++------ 4 files changed, 88 insertions(+), 50 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll index 4bf65e7d6fd08..cb042757a4a42 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.llround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.llround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/llround-conv.ll b/llvm/test/CodeGen/AArch64/llround-conv.ll index 797136037f0e9..4cc089804ce97 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv.ll @@ -1,60 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl llroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b llroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b llroundl entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone -declare i64 @llvm.llround.f128(fp128) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll index bf78fd456eac0..a29dea0eb9f9f 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.lround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/lround-conv.ll b/llvm/test/CodeGen/AArch64/lround-conv.ll index 678d3149f20cc..0bf82b538e70c 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv.ll @@ -1,60 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 
%conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl lroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b lroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b lroundl entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f32(float) nounwind readnone -declare i64 @llvm.lround.i64.f64(double) nounwind readnone -declare i64 @llvm.lround.i64.f128(fp128) nounwind readnone From 320c18a066b29e90ab5f3ef33b6c510f28edeb80 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Mon, 17 Nov 2025 23:03:45 +0300 Subject: [PATCH 076/105] [SystemZ] TableGen-erate node descriptions (#168113) This allows SDNodes to be validated against their expected type profiles and reduces the number of changes required to add a new node. There is only one node that is missing a description -- `GET_CCMASK`, others were successfully imported. Part of #119709. 
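In concrete terms, the node descriptions here are ordinary TableGen SDNode
declarations, of the kind shown in the SystemZOperators.td hunks below. A
minimal sketch of the shape involved, using a made-up MyTargetISD::FOO node;
nothing in this sketch is taken from the patch itself:

// Type profile: one result, two operands, all constrained to the same
// integer type.
def SDT_FooBinOp : SDTypeProfile<1, 2,
  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>]>;

// Node definition; the string is the C++ opcode name that the generated
// SDNodeInfo table describes.
def foo_node : SDNode<"MyTargetISD::FOO", SDT_FooBinOp>;

With one such declaration per node, the generated SystemZGenSDNodeInfo.inc
(added in CMakeLists.txt below) can supply the node-name strings and the
type-profile checks, replacing the hand-written opcode enum and
getTargetNodeName() that this patch deletes.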
Pull Request: https://github.com/llvm/llvm-project/pull/168113 --- llvm/lib/Target/SystemZ/CMakeLists.txt | 1 + .../Target/SystemZ/SystemZISelLowering.cpp | 147 ------- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 386 ------------------ llvm/lib/Target/SystemZ/SystemZOperators.td | 279 ++++++++++++- .../SystemZ/SystemZSelectionDAGInfo.cpp | 20 +- .../Target/SystemZ/SystemZSelectionDAGInfo.h | 29 +- 6 files changed, 295 insertions(+), 567 deletions(-) diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt index 0d8f3eac6ee4f..6d94a755322df 100644 --- a/llvm/lib/Target/SystemZ/CMakeLists.txt +++ b/llvm/lib/Target/SystemZ/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info) tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM SystemZGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(SystemZCommonTableGen) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 58109acc92015..dfd76f9b0427f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7423,153 +7423,6 @@ SystemZTargetLowering::ReplaceNodeResults(SDNode *N, return LowerOperationWrapper(N, Results, DAG); } -const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { -#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME - switch ((SystemZISD::NodeType)Opcode) { - case SystemZISD::FIRST_NUMBER: break; - OPCODE(RET_GLUE); - OPCODE(CALL); - OPCODE(SIBCALL); - OPCODE(TLS_GDCALL); - OPCODE(TLS_LDCALL); - OPCODE(PCREL_WRAPPER); - OPCODE(PCREL_OFFSET); - OPCODE(ICMP); - OPCODE(FCMP); - OPCODE(STRICT_FCMP); - OPCODE(STRICT_FCMPS); - OPCODE(TM); - OPCODE(BR_CCMASK); - OPCODE(SELECT_CCMASK); - OPCODE(ADJDYNALLOC); - OPCODE(PROBED_ALLOCA); - OPCODE(POPCNT); - OPCODE(SMUL_LOHI); - OPCODE(UMUL_LOHI); - OPCODE(SDIVREM); - OPCODE(UDIVREM); - OPCODE(SADDO); - OPCODE(SSUBO); - OPCODE(UADDO); - OPCODE(USUBO); - OPCODE(ADDCARRY); - OPCODE(SUBCARRY); - OPCODE(GET_CCMASK); - OPCODE(MVC); - OPCODE(NC); - OPCODE(OC); - OPCODE(XC); - OPCODE(CLC); - OPCODE(MEMSET_MVC); - OPCODE(STPCPY); - OPCODE(STRCMP); - OPCODE(SEARCH_STRING); - OPCODE(IPM); - OPCODE(TBEGIN); - OPCODE(TBEGIN_NOFLOAT); - OPCODE(TEND); - OPCODE(BYTE_MASK); - OPCODE(ROTATE_MASK); - OPCODE(REPLICATE); - OPCODE(JOIN_DWORDS); - OPCODE(SPLAT); - OPCODE(MERGE_HIGH); - OPCODE(MERGE_LOW); - OPCODE(SHL_DOUBLE); - OPCODE(PERMUTE_DWORDS); - OPCODE(PERMUTE); - OPCODE(PACK); - OPCODE(PACKS_CC); - OPCODE(PACKLS_CC); - OPCODE(UNPACK_HIGH); - OPCODE(UNPACKL_HIGH); - OPCODE(UNPACK_LOW); - OPCODE(UNPACKL_LOW); - OPCODE(VSHL_BY_SCALAR); - OPCODE(VSRL_BY_SCALAR); - OPCODE(VSRA_BY_SCALAR); - OPCODE(VROTL_BY_SCALAR); - OPCODE(SHL_DOUBLE_BIT); - OPCODE(SHR_DOUBLE_BIT); - OPCODE(VSUM); - OPCODE(VACC); - OPCODE(VSCBI); - OPCODE(VAC); - OPCODE(VSBI); - OPCODE(VACCC); - OPCODE(VSBCBI); - OPCODE(VMAH); - OPCODE(VMALH); - OPCODE(VME); - OPCODE(VMLE); - OPCODE(VMO); - OPCODE(VMLO); - OPCODE(VICMPE); - OPCODE(VICMPH); - OPCODE(VICMPHL); - OPCODE(VICMPES); - OPCODE(VICMPHS); - OPCODE(VICMPHLS); - OPCODE(VFCMPE); - OPCODE(STRICT_VFCMPE); - OPCODE(STRICT_VFCMPES); - OPCODE(VFCMPH); - OPCODE(STRICT_VFCMPH); - OPCODE(STRICT_VFCMPHS); - OPCODE(VFCMPHE); - OPCODE(STRICT_VFCMPHE); - 
OPCODE(STRICT_VFCMPHES); - OPCODE(VFCMPES); - OPCODE(VFCMPHS); - OPCODE(VFCMPHES); - OPCODE(VFTCI); - OPCODE(VEXTEND); - OPCODE(STRICT_VEXTEND); - OPCODE(VROUND); - OPCODE(STRICT_VROUND); - OPCODE(VTM); - OPCODE(SCMP128HI); - OPCODE(UCMP128HI); - OPCODE(VFAE_CC); - OPCODE(VFAEZ_CC); - OPCODE(VFEE_CC); - OPCODE(VFEEZ_CC); - OPCODE(VFENE_CC); - OPCODE(VFENEZ_CC); - OPCODE(VISTR_CC); - OPCODE(VSTRC_CC); - OPCODE(VSTRCZ_CC); - OPCODE(VSTRS_CC); - OPCODE(VSTRSZ_CC); - OPCODE(TDC); - OPCODE(ATOMIC_SWAPW); - OPCODE(ATOMIC_LOADW_ADD); - OPCODE(ATOMIC_LOADW_SUB); - OPCODE(ATOMIC_LOADW_AND); - OPCODE(ATOMIC_LOADW_OR); - OPCODE(ATOMIC_LOADW_XOR); - OPCODE(ATOMIC_LOADW_NAND); - OPCODE(ATOMIC_LOADW_MIN); - OPCODE(ATOMIC_LOADW_MAX); - OPCODE(ATOMIC_LOADW_UMIN); - OPCODE(ATOMIC_LOADW_UMAX); - OPCODE(ATOMIC_CMP_SWAPW); - OPCODE(ATOMIC_CMP_SWAP); - OPCODE(ATOMIC_LOAD_128); - OPCODE(ATOMIC_STORE_128); - OPCODE(ATOMIC_CMP_SWAP_128); - OPCODE(LRV); - OPCODE(STRV); - OPCODE(VLER); - OPCODE(VSTER); - OPCODE(STCKF); - OPCODE(PREFETCH); - OPCODE(ADA_ENTRY); - } - return nullptr; -#undef OPCODE -} - // Return true if VT is a vector whose elements are a whole number of bytes // in width. Also check for presence of vector support. bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index d5b76031766dd..13a1cd1614a53 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -22,390 +22,6 @@ #include namespace llvm { -namespace SystemZISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // Return with a glue operand. Operand 0 is the chain operand. - RET_GLUE, - - // Calls a function. Operand 0 is the chain operand and operand 1 - // is the target address. The arguments start at operand 2. - // There is an optional glue operand at the end. - CALL, - SIBCALL, - - // TLS calls. Like regular calls, except operand 1 is the TLS symbol. - // (The call target is implicitly __tls_get_offset.) - TLS_GDCALL, - TLS_LDCALL, - - // Wraps a TargetGlobalAddress that should be loaded using PC-relative - // accesses (LARL). Operand 0 is the address. - PCREL_WRAPPER, - - // Used in cases where an offset is applied to a TargetGlobalAddress. - // Operand 0 is the full TargetGlobalAddress and operand 1 is a - // PCREL_WRAPPER for an anchor point. This is used so that we can - // cheaply refer to either the full address or the anchor point - // as a register base. - PCREL_OFFSET, - - // Integer comparisons. There are three operands: the two values - // to compare, and an integer of type SystemZICMP. - ICMP, - - // Floating-point comparisons. The two operands are the values to compare. - FCMP, - - // Test under mask. The first operand is ANDed with the second operand - // and the condition codes are set on the result. The third operand is - // a boolean that is true if the condition codes need to distinguish - // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the - // register forms do but the memory forms don't). - TM, - - // Branches if a condition is true. Operand 0 is the chain operand; - // operand 1 is the 4-bit condition-code mask, with bit N in - // big-endian order meaning "branch if CC=N"; operand 2 is the - // target block and operand 3 is the flag operand. - BR_CCMASK, - - // Selects between operand 0 and operand 1. 
Operand 2 is the - // mask of condition-code values for which operand 0 should be - // chosen over operand 1; it has the same form as BR_CCMASK. - // Operand 3 is the flag operand. - SELECT_CCMASK, - - // Evaluates to the gap between the stack pointer and the - // base of the dynamically-allocatable area. - ADJDYNALLOC, - - // For allocating stack space when using stack clash protector. - // Allocation is performed by block, and each block is probed. - PROBED_ALLOCA, - - // Count number of bits set in operand 0 per byte. - POPCNT, - - // Wrappers around the ISD opcodes of the same name. The output is GR128. - // Input operands may be GR64 or GR32, depending on the instruction. - SMUL_LOHI, - UMUL_LOHI, - SDIVREM, - UDIVREM, - - // Add/subtract with overflow/carry. These have the same operands as - // the corresponding standard operations, except with the carry flag - // replaced by a condition code value. - SADDO, SSUBO, UADDO, USUBO, ADDCARRY, SUBCARRY, - - // Set the condition code from a boolean value in operand 0. - // Operand 1 is a mask of all condition-code values that may result of this - // operation, operand 2 is a mask of condition-code values that may result - // if the boolean is true. - // Note that this operation is always optimized away, we will never - // generate any code for it. - GET_CCMASK, - - // Use a series of MVCs to copy bytes from one memory location to another. - // The operands are: - // - the target address - // - the source address - // - the constant length - // - // This isn't a memory opcode because we'd need to attach two - // MachineMemOperands rather than one. - MVC, - - // Similar to MVC, but for logic operations (AND, OR, XOR). - NC, - OC, - XC, - - // Use CLC to compare two blocks of memory, with the same comments - // as for MVC. - CLC, - - // Use MVC to set a block of memory after storing the first byte. - MEMSET_MVC, - - // Use an MVST-based sequence to implement stpcpy(). - STPCPY, - - // Use a CLST-based sequence to implement strcmp(). The two input operands - // are the addresses of the strings to compare. - STRCMP, - - // Use an SRST-based sequence to search a block of memory. The first - // operand is the end address, the second is the start, and the third - // is the character to search for. CC is set to 1 on success and 2 - // on failure. - SEARCH_STRING, - - // Store the CC value in bits 29 and 28 of an integer. - IPM, - - // Transaction begin. The first operand is the chain, the second - // the TDB pointer, and the third the immediate control field. - // Returns CC value and chain. - TBEGIN, - TBEGIN_NOFLOAT, - - // Transaction end. Just the chain operand. Returns CC value and chain. - TEND, - - // Create a vector constant by filling byte N of the result with bit - // 15-N of the single operand. - BYTE_MASK, - - // Create a vector constant by replicating an element-sized RISBG-style mask. - // The first operand specifies the starting set bit and the second operand - // specifies the ending set bit. Both operands count from the MSB of the - // element. - ROTATE_MASK, - - // Replicate a GPR scalar value into all elements of a vector. - REPLICATE, - - // Create a vector from two i64 GPRs. - JOIN_DWORDS, - - // Replicate one element of a vector into all elements. The first operand - // is the vector and the second is the index of the element to replicate. - SPLAT, - - // Interleave elements from the high half of operand 0 and the high half - // of operand 1. - MERGE_HIGH, - - // Likewise for the low halves. 
- MERGE_LOW, - - // Concatenate the vectors in the first two operands, shift them left - // by the third operand, and take the first half of the result. - SHL_DOUBLE, - - // Take one element of the first v2i64 operand and the one element of - // the second v2i64 operand and concatenate them to form a v2i64 result. - // The third operand is a 4-bit value of the form 0A0B, where A and B - // are the element selectors for the first operand and second operands - // respectively. - PERMUTE_DWORDS, - - // Perform a general vector permute on vector operands 0 and 1. - // Each byte of operand 2 controls the corresponding byte of the result, - // in the same way as a byte-level VECTOR_SHUFFLE mask. - PERMUTE, - - // Pack vector operands 0 and 1 into a single vector with half-sized elements. - PACK, - - // Likewise, but saturate the result and set CC. PACKS_CC does signed - // saturation and PACKLS_CC does unsigned saturation. - PACKS_CC, - PACKLS_CC, - - // Unpack the first half of vector operand 0 into double-sized elements. - // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. - UNPACK_HIGH, - UNPACKL_HIGH, - - // Likewise for the second half. - UNPACK_LOW, - UNPACKL_LOW, - - // Shift/rotate each element of vector operand 0 by the number of bits - // specified by scalar operand 1. - VSHL_BY_SCALAR, - VSRL_BY_SCALAR, - VSRA_BY_SCALAR, - VROTL_BY_SCALAR, - - // Concatenate the vectors in the first two operands, shift them left/right - // bitwise by the third operand, and take the first/last half of the result. - SHL_DOUBLE_BIT, - SHR_DOUBLE_BIT, - - // For each element of the output type, sum across all sub-elements of - // operand 0 belonging to the corresponding element, and add in the - // rightmost sub-element of the corresponding element of operand 1. - VSUM, - - // Compute carry/borrow indication for add/subtract. - VACC, VSCBI, - // Add/subtract with carry/borrow. - VAC, VSBI, - // Compute carry/borrow indication for add/subtract with carry/borrow. - VACCC, VSBCBI, - - // High-word multiply-and-add. - VMAH, VMALH, - // Widen and multiply even/odd vector elements. - VME, VMLE, VMO, VMLO, - - // Compare integer vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VICMPE is for equality, VICMPH for "signed greater than" - // and VICMPHL for "unsigned greater than". - VICMPE, - VICMPH, - VICMPHL, - - // Likewise, but also set the condition codes on the result. - VICMPES, - VICMPHS, - VICMPHLS, - - // Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and - // greater than" and VFCMPHE for "ordered and greater than or equal to". - VFCMPE, - VFCMPH, - VFCMPHE, - - // Likewise, but also set the condition codes on the result. - VFCMPES, - VFCMPHS, - VFCMPHES, - - // Test floating-point data class for vectors. - VFTCI, - - // Extend the even f32 elements of vector operand 0 to produce a vector - // of f64 elements. - VEXTEND, - - // Round the f64 elements of vector operand 0 to f32s and store them in the - // even elements of the result. - VROUND, - - // AND the two vector operands together and set CC based on the result. - VTM, - - // i128 high integer comparisons. - SCMP128HI, - UCMP128HI, - - // String operations that set CC as a side-effect. - VFAE_CC, - VFAEZ_CC, - VFEE_CC, - VFEEZ_CC, - VFENE_CC, - VFENEZ_CC, - VISTR_CC, - VSTRC_CC, - VSTRCZ_CC, - VSTRS_CC, - VSTRSZ_CC, - - // Test Data Class. 
- // - // Operand 0: the value to test - // Operand 1: the bit mask - TDC, - - // z/OS XPLINK ADA Entry - // Wraps a TargetGlobalAddress that should be loaded from a function's - // AssociatedData Area (ADA). Tha ADA is passed to the function by the - // caller in the XPLink ABI defined register R5. - // Operand 0: the GlobalValue/External Symbol - // Operand 1: the ADA register - // Operand 2: the offset (0 for the first and 8 for the second element in the - // function descriptor) - ADA_ENTRY, - - // Strict variants of scalar floating-point comparisons. - // Quiet and signaling versions. - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPS, - - // Strict variants of vector floating-point comparisons. - // Quiet and signaling versions. - STRICT_VFCMPE, - STRICT_VFCMPH, - STRICT_VFCMPHE, - STRICT_VFCMPES, - STRICT_VFCMPHS, - STRICT_VFCMPHES, - - // Strict variants of VEXTEND and VROUND. - STRICT_VEXTEND, - STRICT_VROUND, - LAST_STRICTFP_OPCODE = STRICT_VROUND, - - // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or - // ATOMIC_LOAD_. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the second operand of , in the high bits of an i32 - // for everything except ATOMIC_SWAPW - // Operand 2: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 3: the negative of operand 2, for rotating the other way - // Operand 4: the width of the field in bits (8 or 16) - FIRST_MEMORY_OPCODE, - ATOMIC_SWAPW = FIRST_MEMORY_OPCODE, - ATOMIC_LOADW_ADD, - ATOMIC_LOADW_SUB, - ATOMIC_LOADW_AND, - ATOMIC_LOADW_OR, - ATOMIC_LOADW_XOR, - ATOMIC_LOADW_NAND, - ATOMIC_LOADW_MIN, - ATOMIC_LOADW_MAX, - ATOMIC_LOADW_UMIN, - ATOMIC_LOADW_UMAX, - - // A wrapper around the inner loop of an ATOMIC_CMP_SWAP. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the compare value, in the low bits of an i32 - // Operand 2: the swap value, in the low bits of an i32 - // Operand 3: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 4: the negative of operand 2, for rotating the other way - // Operand 5: the width of the field in bits (8 or 16) - ATOMIC_CMP_SWAPW, - - // Atomic compare-and-swap returning CC value. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP, - - // 128-bit atomic load. - // Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) - ATOMIC_LOAD_128, - - // 128-bit atomic store. - // OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) - ATOMIC_STORE_128, - - // 128-bit atomic compare-and-swap. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP_128, - - // Byte swapping load/store. Same operands as regular load/store. - LRV, STRV, - - // Element swapping load/store. Same operands as regular load/store. - VLER, VSTER, - - // Use STORE CLOCK FAST to store current TOD clock value. - STCKF, - - // Prefetch from the second operand using the 4-bit control code in - // the first operand. The code is 1 for a load prefetch and 2 for - // a store prefetch. - PREFETCH, - LAST_MEMORY_OPCODE = PREFETCH, -}; - -// Return true if OPCODE is some kind of PC-relative address. 
-inline bool isPCREL(unsigned Opcode) { - return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; -} -} // end namespace SystemZISD namespace SystemZICMP { // Describes whether an integer comparison needs to be signed or unsigned, @@ -532,8 +148,6 @@ class SystemZTargetLowering : public TargetLowering { return true; } - const char *getTargetNodeName(unsigned Opcode) const override; - // This function currently returns cost for srl/ipm/cc sequence for merging. CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 547d3dcf92804..a02cafaaafcdf 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -265,74 +265,151 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, SDNPOutGlue]>; def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>; -// Nodes for SystemZISD::*. See SystemZISelLowering.h for more details. +// Return with a glue operand. Operand 0 is the chain operand. def z_retglue : SDNode<"SystemZISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// Calls a function. Operand 0 is the chain operand and operand 1 +// is the target address. The arguments start at operand 2. +// There is an optional glue operand at the end. def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +// TLS calls. Like regular calls, except operand 1 is the TLS symbol. +// (The call target is implicitly __tls_get_offset.) def z_tls_gdcall : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; + +// Wraps a TargetGlobalAddress that should be loaded using PC-relative +// accesses (LARL). Operand 0 is the address. def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>; + +// Used in cases where an offset is applied to a TargetGlobalAddress. +// Operand 0 is the full TargetGlobalAddress and operand 1 is a +// PCREL_WRAPPER for an anchor point. This is used so that we can +// cheaply refer to either the full address or the anchor point +// as a register base. def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET", SDT_ZWrapOffset, []>; + +// Integer comparisons. There are three operands: the two values +// to compare, and an integer of type SystemZICMP. def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>; + +// Floating-point comparisons. The two operands are the values to compare. def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>; -def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, - [SDNPHasChain]>; -def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, - [SDNPHasChain]>; + +let IsStrictFP = true in { + // Strict variants of scalar floating-point comparisons. + // Quiet and signaling versions. + def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, + [SDNPHasChain]>; + def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, + [SDNPHasChain]>; +} + +// Test under mask. The first operand is ANDed with the second operand +// and the condition codes are set on the result. 
The third operand is +// a boolean that is true if the condition codes need to distinguish +// between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the +// register forms do but the memory forms don't). def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp>; + +// Branches if a condition is true. Operand 0 is the chain operand; +// operand 1 is the 4-bit condition-code mask, with bit N in +// big-endian order meaning "branch if CC=N"; operand 2 is the +// target block and operand 3 is the flag operand. def z_br_ccmask_1 : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask, [SDNPHasChain]>; + +// Selects between operand 0 and operand 1. Operand 2 is the +// mask of condition-code values for which operand 0 should be +// chosen over operand 1; it has the same form as BR_CCMASK. +// Operand 3 is the flag operand. def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask>; + +// Store the CC value in bits 29 and 28 of an integer. def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>; + +// Evaluates to the gap between the stack pointer and the +// base of the dynamically-allocatable area. def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; + +// For allocating stack space when using stack clash protector. +// Allocation is performed by block, and each block is probed. def z_probed_alloca : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca, [SDNPHasChain]>; + +// Count number of bits set in operand 0 per byte. def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; + +// Wrappers around the ISD opcodes of the same name. The output is GR128. +// Input operands may be GR64 or GR32, depending on the instruction. def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; + +// Add/subtract with overflow/carry. These have the same operands as +// the corresponding standard operations, except with the carry flag +// replaced by a condition code value. def z_saddo : SDNode<"SystemZISD::SADDO", SDT_ZBinaryWithFlags>; def z_ssubo : SDNode<"SystemZISD::SSUBO", SDT_ZBinaryWithFlags>; def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>; def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>; def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>; def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>; + +// Compute carry/borrow indication for add/subtract. def z_vacc : SDNode<"SystemZISD::VACC", SDTIntBinOp>; -def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; -def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vscbi : SDNode<"SystemZISD::VSCBI", SDTIntBinOp>; + +// Add/subtract with carry/borrow. +def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; def z_vsbi : SDNode<"SystemZISD::VSBI", SDT_ZTernary>; + +// Compute carry/borrow indication for add/subtract with carry/borrow. +def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vsbcbi : SDNode<"SystemZISD::VSBCBI", SDT_ZTernary>; + +// High-word multiply-and-add. def z_vmah : SDNode<"SystemZISD::VMAH", SDT_ZTernary>; def z_vmalh : SDNode<"SystemZISD::VMALH", SDT_ZTernary>; + +// Widen and multiply even/odd vector elements. 
def z_vme : SDNode<"SystemZISD::VME", SDT_ZBinaryConv>; def z_vmle : SDNode<"SystemZISD::VMLE", SDT_ZBinaryConv>; def z_vmo : SDNode<"SystemZISD::VMO", SDT_ZBinaryConv>; def z_vmlo : SDNode<"SystemZISD::VMLO", SDT_ZBinaryConv>; +// Byte swapping load/store. Same operands as regular load/store. def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storebswap : SDNode<"SystemZISD::STRV", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Element swapping load/store. Same operands as regular load/store. def z_loadeswap : SDNode<"SystemZISD::VLER", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storeeswap : SDNode<"SystemZISD::VSTER", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Use STORE CLOCK FAST to store current TOD clock value. def z_stckf : SDNode<"SystemZISD::STCKF", SDT_ZStoreInherent, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +// Test Data Class. +// +// Operand 0: the value to test +// Operand 1: the bit mask def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>; def z_eh_sjlj_setjmp : SDNode<"ISD::EH_SJLJ_SETJMP", SDT_ZSetJmp, @@ -346,26 +423,75 @@ def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", SDT_ZInsertVectorElt>; def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDT_ZExtractVectorElt>; + +// Create a vector constant by filling byte N of the result with bit +// 15-N of the single operand. def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; + +// Create a vector constant by replicating an element-sized RISBG-style mask. +// The first operand specifies the starting set bit and the second operand +// specifies the ending set bit. Both operands count from the MSB of the +// element. def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>; + +// Replicate a GPR scalar value into all elements of a vector. def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>; + +// Create a vector from two i64 GPRs. def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>; + +// Replicate one element of a vector into all elements. The first operand +// is the vector and the second is the index of the element to replicate. def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>; + +// Interleave elements from the high half of operand 0 and the high half +// of operand 1. def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>; + +// Likewise for the low halves. def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>; + +// Concatenate the vectors in the first two operands, shift them left +// by the third operand, and take the first half of the result. def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>; + +// Concatenate the vectors in the first two operands, shift them left/right +// bitwise by the third operand, and take the first/last half of the result. def z_shl_double_bit : SDNode<"SystemZISD::SHL_DOUBLE_BIT", SDT_ZVecTernaryInt>; def z_shr_double_bit : SDNode<"SystemZISD::SHR_DOUBLE_BIT", SDT_ZVecTernaryInt>; + +// Take one element of the first v2i64 operand and the one element of +// the second v2i64 operand and concatenate them to form a v2i64 result. +// The third operand is a 4-bit value of the form 0A0B, where A and B +// are the element selectors for the first operand and second operands +// respectively. def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", SDT_ZVecTernaryInt>; + +// Perform a general vector permute on vector operands 0 and 1. 
+// Each byte of operand 2 controls the corresponding byte of the result, +// in the same way as a byte-level VECTOR_SHUFFLE mask. def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; + +// Pack vector operands 0 and 1 into a single vector with half-sized elements. def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; + +// Likewise, but saturate the result and set CC. PACKS_CC does signed +// saturation and PACKLS_CC does unsigned saturation. def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>; def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>; + +// Unpack the first half of vector operand 0 into double-sized elements. +// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnpack>; def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnpack>; + +// Likewise for the second half. def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnpack>; def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnpack>; + +// Shift/rotate each element of vector operand 0 by the number of bits +// specified by scalar operand 1. def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -374,40 +500,75 @@ def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", SDT_ZVecBinaryInt>; + +// For each element of the output type, sum across all sub-elements of +// operand 0 belonging to the corresponding element, and add in the +// rightmost sub-element of the corresponding element of operand 1. def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZBinaryConv>; + +// Compare integer vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VICMPE is for equality, VICMPH for "signed greater than" +// and VICMPHL for "unsigned greater than". def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecCompare>; def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecCompare>; def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecCompare>; + +// Likewise, but also set the condition codes on the result. def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecCompareCC>; def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecCompareCC>; def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecCompareCC>; + +// Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and +// greater than" and VFCMPHE for "ordered and greater than or equal to". def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>; -def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmpes : SDNode<"SystemZISD::STRICT_VFCMPES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>; -def z_strict_vfcmph : SDNode<"SystemZISD::STRICT_VFCMPH", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphs : SDNode<"SystemZISD::STRICT_VFCMPHS", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>; -def z_strict_vfcmphe : SDNode<"SystemZISD::STRICT_VFCMPHE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; + +// Likewise, but also set the condition codes on the result. 
def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConvCC>; def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConvCC>; def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConvCC>; + +// Extend the even f32 elements of vector operand 0 to produce a vector +// of f64 elements. def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>; -def z_strict_vextend : SDNode<"SystemZISD::STRICT_VEXTEND", - SDT_ZVecUnaryConv, [SDNPHasChain]>; + +// Round the f64 elements of vector operand 0 to f32s and store them in the +// even elements of the result. def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>; -def z_strict_vround : SDNode<"SystemZISD::STRICT_VROUND", + +let IsStrictFP = true in { + // Strict variants of vector floating-point comparisons. + // Quiet and signaling versions. + def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmph : SDNode<"SystemZISD::STRICT_VFCMPH", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphe : SDNode<"SystemZISD::STRICT_VFCMPHE", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmpes : SDNode<"SystemZISD::STRICT_VFCMPES", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphs : SDNode<"SystemZISD::STRICT_VFCMPHS", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + + // Strict variants of VEXTEND and VROUND. + def z_strict_vextend : SDNode<"SystemZISD::STRICT_VEXTEND", + SDT_ZVecUnaryConv, [SDNPHasChain]>; + def z_strict_vround : SDNode<"SystemZISD::STRICT_VROUND", SDT_ZVecUnaryConv, [SDNPHasChain]>; +} + +// AND the two vector operands together and set CC based on the result. def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp>; + +// i128 high integer comparisons. def z_scmp128hi : SDNode<"SystemZISD::SCMP128HI", SDT_ZCmp>; def z_ucmp128hi : SDNode<"SystemZISD::UCMP128HI", SDT_ZCmp>; + +// String operations that set CC as a side-effect. def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>; def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryIntCC>; def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>; @@ -423,12 +584,24 @@ def z_vstrs_cc : SDNode<"SystemZISD::VSTRS_CC", SDT_ZVecTernaryConvCC>; def z_vstrsz_cc : SDNode<"SystemZISD::VSTRSZ_CC", SDT_ZVecTernaryConvCC>; + +// Test floating-point data class for vectors. def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>; class AtomicWOp : SDNode<"SystemZISD::"#name, profile, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or +// ATOMIC_LOAD_. +// +// Operand 0: the address of the containing 32-bit-aligned field +// Operand 1: the second operand of , in the high bits of an i32 +// for everything except ATOMIC_SWAPW +// Operand 2: how many bits to rotate the i32 left to bring the first +// operand into the high bits +// Operand 3: the negative of operand 2, for rotating the other way +// Operand 4: the width of the field in bits (8 or 16) def z_atomic_swapw : AtomicWOp<"ATOMIC_SWAPW">; def z_atomic_loadw_add : AtomicWOp<"ATOMIC_LOADW_ADD">; def z_atomic_loadw_sub : AtomicWOp<"ATOMIC_LOADW_SUB">; @@ -441,55 +614,117 @@ def z_atomic_loadw_max : AtomicWOp<"ATOMIC_LOADW_MAX">; def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">; def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">; +// Atomic compare-and-swap returning CC value. 
+// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) def z_atomic_cmp_swap : SDNode<"SystemZISD::ATOMIC_CMP_SWAP", SDT_ZAtomicCmpSwap, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +// A wrapper around the inner loop of an ATOMIC_CMP_SWAP. +// +// Operand 0: the address of the containing 32-bit-aligned field +// Operand 1: the compare value, in the low bits of an i32 +// Operand 2: the swap value, in the low bits of an i32 +// Operand 3: how many bits to rotate the i32 left to bring the first +// operand into the high bits +// Operand 4: the negative of operand 3, for rotating the other way +// Operand 5: the width of the field in bits (8 or 16) def z_atomic_cmp_swapw : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// 128-bit atomic load. +// Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) def z_atomic_load_128 : SDNode<"SystemZISD::ATOMIC_LOAD_128", SDT_ZAtomicLoad128, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// 128-bit atomic store. +// OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) def z_atomic_store_128 : SDNode<"SystemZISD::ATOMIC_STORE_128", SDT_ZAtomicStore128, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// 128-bit atomic compare-and-swap. +// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128", SDT_ZAtomicCmpSwap128, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Use a series of MVCs to copy bytes from one memory location to another. +// The operands are: +// - the target address +// - the source address +// - the constant length +// +// This isn't a memory opcode because we'd need to attach two +// MachineMemOperands rather than one. def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Similar to MVC, but for logic operations (AND, OR, XOR). def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use CLC to compare two blocks of memory, with the same comments +// as for MVC. def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; + +// Use MVC to set a block of memory after storing the first byte. def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use a CLST-based sequence to implement strcmp(). The two input operands +// are the addresses of the strings to compare. def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; + +// Use an MVST-based sequence to implement stpcpy(). def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use an SRST-based sequence to search a block of memory. The first +// operand is the end address, the second is the start, and the third +// is the character to search for. CC is set to 1 on success and 2 +// on failure. def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; + +// Prefetch from the second operand using the 4-bit control code in +// the first operand. The code is 1 for a load prefetch and 2 for +// a store prefetch. 
def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +// Transaction begin. The first operand is the chain, the second +// the TDB pointer, and the third the immediate control field. +// Returns CC value and chain. def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin, [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>; def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin, [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>; + +// Transaction end. Just the chain operand. Returns CC value and chain. def z_tend : SDNode<"SystemZISD::TEND", SDT_ZTEnd, [SDNPHasChain, SDNPSideEffect]>; +// z/OS XPLINK ADA Entry +// Wraps a TargetGlobalAddress that should be loaded from a function's +// AssociatedData Area (ADA). The ADA is passed to the function by the +// caller in the XPLink ABI-defined register R5. +// Operand 0: the GlobalValue/External Symbol +// Operand 1: the ADA register +// Operand 2: the offset (0 for the first and 8 for the second element in the +// function descriptor) def z_ada_entry : SDNode<"SystemZISD::ADA_ENTRY", SDT_ZADAENTRY>; diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index eb00d484af693..88feba8adce0e 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -10,21 +10,27 @@ // //===----------------------------------------------------------------------===// +#include "SystemZSelectionDAGInfo.h" #include "SystemZTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" +#define GET_SDNODE_DESC +#include "SystemZGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -bool SystemZSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= SystemZISD::FIRST_MEMORY_OPCODE && - Opcode <= SystemZISD::LAST_MEMORY_OPCODE; -} +SystemZSelectionDAGInfo::SystemZSelectionDAGInfo() + : SelectionDAGGenTargetInfo(SystemZGenSDNodeInfo) {} + +const char *SystemZSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { + switch (static_cast(Opcode)) { + case SystemZISD::GET_CCMASK: + return "SystemZISD::GET_CCMASK"; + } -bool SystemZSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { - return Opcode >= SystemZISD::FIRST_STRICTFP_OPCODE && - Opcode <= SystemZISD::LAST_STRICTFP_OPCODE; + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } static unsigned getMemMemLenAdj(unsigned Op) { diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index 200566f9646c1..d25fddab65161 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -15,15 +15,34 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "SystemZGenSDNodeInfo.inc" + namespace llvm { +namespace SystemZISD { + +enum NodeType : unsigned { + // Set the condition code from a boolean value in operand 0. + // Operand 1 is a mask of all condition-code values that may result from this + // operation, operand 2 is a mask of condition-code values that may result + // if the boolean is true. + // Note that this operation is always optimized away; we will never + // generate any code for it. 
+ GET_CCMASK = GENERATED_OPCODE_END, +}; -class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo { -public: - explicit SystemZSelectionDAGInfo() = default; +// Return true if OPCODE is some kind of PC-relative address. +inline bool isPCREL(unsigned Opcode) { + return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; +} - bool isTargetMemoryOpcode(unsigned Opcode) const override; +} // namespace SystemZISD + +class SystemZSelectionDAGInfo : public SelectionDAGGenTargetInfo { +public: + SystemZSelectionDAGInfo(); - bool isTargetStrictFPOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, From b32c434426a181590234f65f2e32f43735bf5b5a Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Mon, 17 Nov 2025 12:08:39 -0800 Subject: [PATCH 077/105] [libc][Github] Perform baremetal libc builds (#167583) Currently there are no 32-bit presubmit builds for libc. This PR performs a 32-bit build only (no tests) to check that changes landing in libc do not break 32-bit builds. Co-authored-by: Aiden Grossman --- .github/workflows/libc-fullbuild-tests.yml | 83 +++++++++++++++---- libc/cmake/caches/armv6m-none-eabi.cmake | 8 ++ libc/cmake/caches/armv7em-none-eabi.cmake | 8 ++ libc/cmake/caches/armv7m-none-eabi.cmake | 8 ++ .../caches/armv8.1m.main-none-eabi.cmake | 8 ++ libc/cmake/caches/armv8m.main-none-eabi.cmake | 8 ++ libc/cmake/caches/baremetal_common.cmake | 21 +++++ libc/cmake/caches/riscv32-unknown-elf.cmake | 4 + 8 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 libc/cmake/caches/armv6m-none-eabi.cmake create mode 100644 libc/cmake/caches/armv7em-none-eabi.cmake create mode 100644 libc/cmake/caches/armv7m-none-eabi.cmake create mode 100644 libc/cmake/caches/armv8.1m.main-none-eabi.cmake create mode 100644 libc/cmake/caches/armv8m.main-none-eabi.cmake create mode 100644 libc/cmake/caches/baremetal_common.cmake create mode 100644 libc/cmake/caches/riscv32-unknown-elf.cmake diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 3a048aeb9405b..c5b7f606a115a 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -48,6 +48,42 @@ jobs: cpp_compiler: clang++-22 target: x86_64-unknown-uefi-llvm include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv6m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7em-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8.1m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: riscv32-unknown-elf + include_scudo: OFF # TODO: add back gcc build when it is fixed # - c_compiler: gcc # cpp_compiler: g++ @@ -93,28 +129,39 @@ jobs: run: | export RUNTIMES="libc" + export CMAKE_FLAGS=" + -G Ninja + -S ${{ github.workspace }}/runtimes + -B ${{ 
steps.strings.outputs.build-output-dir }} + -DCMAKE_ASM_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCMAKE_C_COMPILER_LAUNCHER=sccache + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }}" + if [[ ${{ matrix.include_scudo}} == "ON" ]]; then export RUNTIMES="$RUNTIMES;compiler-rt" - export CMAKE_FLAGS=" + export CMAKE_FLAGS="$CMAKE_FLAGS -DLLVM_LIBC_INCLUDE_SCUDO=ON -DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON -DCOMPILER_RT_BUILD_GWP_ASAN=OFF -DCOMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED=OFF" fi - cmake -B ${{ steps.strings.outputs.build-output-dir }} \ - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} \ - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} \ - -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \ - -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \ - -DLLVM_LIBC_FULL_BUILD=ON \ - -G Ninja \ - -S ${{ github.workspace }}/runtimes \ - $CMAKE_FLAGS + case "${{ matrix.target }}" in + *-none-eabi|riscv32-unknown-elf) + cmake $CMAKE_FLAGS \ + -C ${{ github.workspace }}/libc/cmake/caches/${{ matrix.target }}.cmake + ;; + *) + cmake -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \ + -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \ + -DLLVM_LIBC_FULL_BUILD=ON \ + $CMAKE_FLAGS + ;; + esac - name: Build run: > @@ -124,8 +171,12 @@ jobs: --target install - name: Test - # Skip UEFI tests until we have testing set up. - if: ${{ ! endsWith(matrix.target, '-uefi-llvm') }} + # Skip UEFI and baremetal tests until we have testing set up. 
+ if: ${{ + !endsWith(matrix.target, '-uefi-llvm') && + !endsWith(matrix.target, '-none-eabi') && + matrix.target != 'riscv32-unknown-elf' + }} run: > cmake --build ${{ steps.strings.outputs.build-output-dir }} diff --git a/libc/cmake/caches/armv6m-none-eabi.cmake b/libc/cmake/caches/armv6m-none-eabi.cmake new file mode 100644 index 0000000000000..1f463ae5c0ead --- /dev/null +++ b/libc/cmake/caches/armv6m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv6m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv6m -mcpu=cortex-m0plus -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7em-none-eabi.cmake b/libc/cmake/caches/armv7em-none-eabi.cmake new file mode 100644 index 0000000000000..afbe9c87dffe1 --- /dev/null +++ b/libc/cmake/caches/armv7em-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7em-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7em -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7m-none-eabi.cmake b/libc/cmake/caches/armv7m-none-eabi.cmake new file mode 100644 index 0000000000000..796adb2f31148 --- /dev/null +++ b/libc/cmake/caches/armv7m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7m -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8.1m.main-none-eabi.cmake b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4095facce46ac --- /dev/null +++ b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8.1m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp -mcpu=cortex-m55" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8m.main-none-eabi.cmake b/libc/cmake/caches/armv8m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4b69f6a822e71 --- /dev/null +++ b/libc/cmake/caches/armv8m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "") +endforeach() + 
+include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/baremetal_common.cmake b/libc/cmake/caches/baremetal_common.cmake new file mode 100644 index 0000000000000..c0d665d790393 --- /dev/null +++ b/libc/cmake/caches/baremetal_common.cmake @@ -0,0 +1,21 @@ +# Expects target triple to be passed as `RUNTIMES_TARGET_TRIPLE` + +set(CMAKE_SYSTEM_NAME Generic CACHE STRING "") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") +set(LLVM_ENABLE_RUNTIMES "libc" CACHE STRING "") +set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "") +set(CMAKE_C_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_SYSROOT "" CACHE STRING "") +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_ASM_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LLVM_DEFAULT_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LIBC_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") + +set(LLVM_LIBC_FULL_BUILD "ON" CACHE BOOL "") diff --git a/libc/cmake/caches/riscv32-unknown-elf.cmake b/libc/cmake/caches/riscv32-unknown-elf.cmake new file mode 100644 index 0000000000000..960fb2bb51a4f --- /dev/null +++ b/libc/cmake/caches/riscv32-unknown-elf.cmake @@ -0,0 +1,4 @@ +set(CMAKE_SYSTEM_PROCESSOR RISCV CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "riscv32-unknown-elf" CACHE STRING "") + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) From 557a6b826b865cd1797ae421f59f286609b94e59 Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Mon, 17 Nov 2025 20:15:33 +0000 Subject: [PATCH 078/105] [lldb][NFC] use llvm::erase_if to remove non matching types (#168279) --- lldb/source/Symbol/Symtab.cpp | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp index 6080703998ff2..9964ae492bc00 100644 --- a/lldb/source/Symbol/Symtab.cpp +++ b/lldb/source/Symbol/Symtab.cpp @@ -722,15 +722,11 @@ Symtab::AppendSymbolIndexesWithNameAndType(ConstString symbol_name, std::vector &indexes) { std::lock_guard guard(m_mutex); - if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return indexes.size(); } @@ -742,15 +738,11 @@ uint32_t Symtab::AppendSymbolIndexesWithNameAndType( std::lock_guard guard(m_mutex); if (AppendSymbolIndexesWithName(symbol_name, symbol_debug_type, - symbol_visibility, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + symbol_visibility, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return indexes.size(); } From 
bac8d01a4da14802ec03907d094f3bbc68f6a5cc Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:18:01 -0800 Subject: [PATCH 079/105] [bazel][libc] Fixes #165219 (#168429) --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 788c6570081a2..a27abbd5b386a 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1103,6 +1103,7 @@ libc_support_library( ":func_realloc", ":hdr_stdio_macros", ":hdr_stdio_overlay", + ":string_memory_utils", ":types_off_t", ], ) From 3fb374256b2fcd3dc091612c6c18a6ad6b6bf138 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:38:57 -0800 Subject: [PATCH 080/105] [bazel] Fix #168113 (#168434) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d582f448c2213..d021f9da38dbb 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3177,6 +3177,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/SystemZ/SystemZGenSDNodeInfo.inc", + ) ], }, { From 4bec74a9fb82b70db0c1acfc3d1d92d8003d51fd Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:42:46 -0800 Subject: [PATCH 081/105] [mlir][bazel] Fix #168066 (#168435) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 153c7eeedd0ab..452380a8953ff 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -13105,6 +13105,7 @@ cc_library( ":RuntimeVerifiableOpInterface", ":ShapedOpInterfaces", ":SideEffectInterfaces", + ":UBDialect", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support", From b00588ffb4f518605b3a1778458e38f21784b9fa Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 17 Nov 2025 15:46:14 -0500 Subject: [PATCH 082/105] Fix bazel dep caused by f5b73760 (#168436) From 321b9d190b32c2c10bbd59761e34ef0305bdb954 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 15:45:29 +0000 Subject: [PATCH 083/105] [VPlan] Replace VPIRMetadata::addMetadata with setMetadata. (NFC) Replace addMetadata with setMetadata, which sets metadata, updating existing entries or adding a new entry otherwise. This isn't strictly needed at the moment, but will be needed for follow-up patches. --- llvm/lib/Transforms/Vectorize/VPlan.h | 19 +++++++++++-------- .../Vectorize/VPlanConstruction.cpp | 6 +++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0932922c07126..67fa294d095bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -970,14 +970,17 @@ class VPIRMetadata { /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; - /// Add metadata with kind \p Kind and \p Node. 
- void addMetadata(unsigned Kind, MDNode *Node) { - assert(none_of(Metadata, - [Kind](const std::pair &P) { - return P.first == Kind; - }) && - "Kind must appear at most once in Metadata"); - Metadata.emplace_back(Kind, Node); + /// Set metadata with kind \p Kind to \p Node. If metadata with \p Kind + /// already exists, it will be replaced. Otherwise, it will be added. + void setMetadata(unsigned Kind, MDNode *Node) { + auto It = + llvm::find_if(Metadata, [Kind](const std::pair &P) { + return P.first == Kind; + }); + if (It != Metadata.end()) + It->second = Node; + else + Metadata.emplace_back(Kind, Node); } /// Intersect this VPIRMetada object with \p MD, keeping only metadata diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 4ffd5577d31a4..aed85271350c8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -672,7 +672,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -756,7 +756,7 @@ void VPlanTransforms::addMinimumIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights( ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -793,7 +793,7 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(Weights, /*IsExpected=*/false); - Branch->addMetadata(LLVMContext::MD_prof, BranchWeights); + Branch->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// If \p RedPhiR is used by a ComputeReductionResult recipe, return it. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bbeb447de45cb..3e2c47e4556a6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4458,7 +4458,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); - MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); + MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// Create and return a ResumePhi for \p WideIV, unless it is truncated. 
If the From 54c2c7cf0da21bf7d85f144aa6cb6875e2a9373a Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 17 Nov 2025 13:01:52 -0800 Subject: [PATCH 084/105] [LLDB] Fix test compilation errors under asan (NFC) (#168408) https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/lldb-cmake-sanitized/2744/consoleText --- lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 4 +++- .../commands/target/auto-install-main-executable/Makefile | 2 +- .../macosx/find-dsym/bundle-with-dot-in-filename/Makefile | 4 ++-- lldb/test/API/macosx/find-dsym/deep-bundle/Makefile | 4 ++-- lldb/test/API/macosx/posix_spawn/Makefile | 6 +++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 0122fe8409c29..55dbd3934860f 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -339,9 +339,11 @@ endif # library to make ASAN tests work for most users, including the bots. ifeq "$(OS)" "Darwin" ifneq "$(ASAN_OPTIONS)" "" -LDFLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +ASAN_LDFLAGS = -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib endif endif +LDFLAGS += $(ASAN_LDFLAGS) + OBJECTS = EXE ?= a.out diff --git a/lldb/test/API/commands/target/auto-install-main-executable/Makefile b/lldb/test/API/commands/target/auto-install-main-executable/Makefile index 07e6c9a1d0f15..d0578fb699d1b 100644 --- a/lldb/test/API/commands/target/auto-install-main-executable/Makefile +++ b/lldb/test/API/commands/target/auto-install-main-executable/Makefile @@ -6,4 +6,4 @@ a.out: a.device.out include Makefile.rules a.device.out: - $(CXX) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp + $(CXX) $(ASAN_LDFLAGS) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp diff --git a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile index 12781fd847768..f13584041fb51 100644 --- a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile +++ b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile @@ -5,7 +5,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c mkdir com.apple.sbd.xpc mv com.apple.sbd com.apple.sbd.xpc/ mkdir -p com.apple.sbd.xpc.dSYM/Contents/Resources/DWARF @@ -13,7 +13,7 @@ $(EXE): rm -rf com.apple.sbd.dSYM mkdir hide.app tar cf - com.apple.sbd.xpc com.apple.sbd.xpc.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c clean:: rm -rf a.out a.out.dSYM hide.app com.apple.sbd com.apple.sbd.dSYM com.apple.sbd.xpc com.apple.sbd.xpc.dSYM find-bundle-with-dots-in-fn find-bundle-with-dots-in-fn.dSYM diff --git a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile index 806c840c9f2ee..c041d9e7a0e95 100644 --- a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile +++ b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile @@ -4,7 +4,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c + 
$(CC) $(ASAN_LDFLAGS) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c mkdir -p MyFramework.framework/Versions/A/Headers mkdir -p MyFramework.framework/Versions/A/Resources cp MyFramework MyFramework.framework/Versions/A @@ -18,7 +18,7 @@ $(EXE): mkdir hide.app rm -f MyFramework tar cf - MyFramework.framework MyFramework.framework.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. -framework MyFramework + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. -framework MyFramework clean:: rm -rf a.out a.out.dSYM deep-bundle deep-bundle.dSYM MyFramework.framework MyFramework.framework.dSYM MyFramework MyFramework.dSYM hide.app diff --git a/lldb/test/API/macosx/posix_spawn/Makefile b/lldb/test/API/macosx/posix_spawn/Makefile index 7ae46ca95828d..cbdee9122e3f2 100644 --- a/lldb/test/API/macosx/posix_spawn/Makefile +++ b/lldb/test/API/macosx/posix_spawn/Makefile @@ -6,13 +6,13 @@ include Makefile.rules all: fat.out x86_64.out: x86_64.c - $(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< x86_64h.out: x86_64h.c - $(CC) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< arm64.out: arm64.c - $(CC) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< fat.out: x86_64.out x86_64h.out arm64.out $(LIPO) -o fat.out -create $^ From 24c524d01423dd4b922fd4118613717a1b7e7f41 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Nov 2025 16:07:28 -0500 Subject: [PATCH 085/105] [libc++] Enable compiler-rt when performing a bootstrapping build (#167065) Otherwise, we end up using whatever system-provided compiler runtime is available, which doesn't work on macOS since compiler-rt is located inside the toolchain path, which can't be found by default. However, disable the tests for compiler-rt since those are linking against the system C++ standard library while using the just-built libc++ headers, which is nonsensical and leads to undefined references on macOS. --- libcxx/docs/VendorDocumentation.rst | 14 ++++++++------ libcxx/utils/ci/run-buildbot | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/libcxx/docs/VendorDocumentation.rst b/libcxx/docs/VendorDocumentation.rst index 15677c7428263..b14c7a70aee04 100644 --- a/libcxx/docs/VendorDocumentation.rst +++ b/libcxx/docs/VendorDocumentation.rst @@ -81,12 +81,14 @@ CMake invocation at ``/llvm``: .. code-block:: bash $ mkdir build - $ cmake -G Ninja -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang" \ # Configure - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ - -DLLVM_RUNTIME_TARGETS="" - $ ninja -C build runtimes # Build - $ ninja -C build check-runtimes # Test - $ ninja -C build install-runtimes # Install + $ cmake -G Ninja -S llvm -B build \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DLLVM_ENABLE_PROJECTS="clang" \ # Configure + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ + -DLLVM_RUNTIME_TARGETS="" + $ ninja -C build runtimes # Build + $ ninja -C build check-runtimes # Test + $ ninja -C build install-runtimes # Install .. 
note:: - This type of build is also commonly called a "Runtimes build", but we would like to move diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index d265dddebe11f..7442361627104 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -366,11 +366,12 @@ bootstrapping-build) -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLLVM_ENABLE_PROJECTS="clang;lldb" \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ -DLLVM_RUNTIME_TARGETS="$(${CXX} --print-target-triple)" \ -DLLVM_HOST_TRIPLE="$(${CXX} --print-target-triple)" \ -DLLVM_TARGETS_TO_BUILD="host" \ -DRUNTIMES_BUILD_ALLOW_DARWIN=ON \ + -DCOMPILER_RT_INCLUDE_TESTS=OFF \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" From aae2b891e8b28adafde9be1ee2ddd327aa72ccfa Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Nov 2025 16:07:45 -0500 Subject: [PATCH 086/105] [libc++] Replace a few .compile.fail.cpp tests by proper clang-verify tests (#167346) We want to eliminate all .compile.fail.cpp tests since they are brittle: these tests pass regardless of the specific compilation error, which means that e.g. a missing include will render the test null. This is not an exhaustive pass, just a few tests I stumbled upon. --- ...mpile.fail.cpp => gets-removed.verify.cpp} | 10 +-- .../re.tokiter.cnstr/array.compile.fail.cpp | 40 ----------- .../re.tokiter.cnstr/init.compile.fail.cpp | 37 ---------- .../re.tokiter.cnstr/int.compile.fail.cpp | 36 ---------- .../temporary-objects.verify.cpp | 72 +++++++++++++++++++ .../re.tokiter.cnstr/vector.compile.fail.cpp | 41 ----------- 6 files changed, 75 insertions(+), 161 deletions(-) rename libcxx/test/std/input.output/file.streams/c.files/{gets.compile.fail.cpp => gets-removed.verify.cpp} (70%) delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp create mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp diff --git a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp similarity index 70% rename from libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp rename to libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp index 1a92cc925e2aa..281ef37e92d27 100644 --- a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp +++ b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp @@ -7,15 +7,11 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11 -// test -// gets +// Verify that std::gets has been removed in C++14 and later #include <cstdio> -int main(int, char**) -{ - (void) std::gets((char *) NULL); - - return 0; +void f(char const* str) { + (void)std::gets(str); // expected-error {{no member named 'gets' in namespace 'std'}} } diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp deleted file 
mode 100644 index a03fd52c03562..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const int (&submatches)[N], -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - const int indices[] = {-1, 0, 1}; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), indices); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp deleted file mode 100644 index b6913e6b32d12..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// initializer_list submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp deleted file mode 100644 index 3c39d4983e26c..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, int submatch = 0, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-\\d{4}"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), -1); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp new file mode 100644 index 0000000000000..b1ab0f337de2f --- /dev/null +++ b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11 + +// Ensure that we don't allow iterators into temporary std::regex objects. + +// +// +// class regex_iterator +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, int submatch = 0, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const int (&submatches)[N], +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// initializer_list submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const std::vector& submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); + +#include +#include +#include + +void f() { + std::regex phone_numbers("\\d{3}-\\d{4}"); + const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; + + { // int submatch + std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), -1); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const int (&submatches)[N] + const int indices[] = {-1, 0, 1}; + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), indices); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // initializer_list submatches + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const std::vector& submatches + std::vector v; + v.push_back(-1); + v.push_back(-1); + 
std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), v); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } +} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp deleted file mode 100644 index 9b07df9d1a783..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp +++ /dev/null @@ -1,41 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const std::vector& submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::vector v; - v.push_back(-1); - v.push_back(-1); - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), v); - } - - return 0; -} From 7693f124ff7fbeacce66ef3012fef119b40db330 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 13:08:06 -0800 Subject: [PATCH 087/105] [mlir][bazel] Fix #167957 (#168441) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 452380a8953ff..1421ec553f251 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10318,6 +10318,8 @@ cc_library( ), includes = ["include"], deps = [ + ":FunctionInterfaces", + ":IR", ":OpenACCDialect", ":OpenACCOpsIncGen", ":OpenACCPassIncGen", @@ -10325,6 +10327,7 @@ cc_library( ":Support", ":ViewLikeInterface", "//llvm:Support", + "//llvm:ir_headers", ], ) From 0d8c29409ceeba7fc0561bae2b9d4e4e4e936cba Mon Sep 17 00:00:00 2001 From: Daniel Wedzicha Date: Mon, 17 Nov 2025 17:13:24 -0400 Subject: [PATCH 088/105] Fixed typo in llvm-otool (#168395) --- llvm/tools/llvm-objdump/OtoolOpts.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td index dc7a5b445cffe..706d9e0182f58 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -14,7 +14,7 @@ def G : Flag<["-"], "G">, HelpText<"print data-in-code table">; def h : Flag<["-"], "h">, HelpText<"print mach header">; def I : Flag<["-"], "I">, HelpText<"print indirect symbol table">; def j : Flag<["-"], "j">, HelpText<"print opcode bytes">; -def l : Flag<["-"], "l">, HelpText<"print load commnads">; +def l : Flag<["-"], "l">, HelpText<"print load commands">; def L : Flag<["-"], "L">, HelpText<"print used shared libraries">; def mcpu_EQ : Joined<["-"], "mcpu=">, HelpText<"select cpu for disassembly">; def o : Flag<["-"], "o">, HelpText<"print Objective-C segment">; 
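For context on the patch above: llvm-otool is a driver that mirrors the classic otool(1) interface, and the -l flag whose help string is corrected here prints a binary's Mach-O load commands. A minimal invocation, using a.out as a placeholder name for any Mach-O binary, would be:

    $ llvm-otool -l a.out    # print the Mach-O load commands (equivalent to `otool -l a.out`)
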
From ed617bd78082bb569059f2f698e41cbba5317afb Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 13:21:59 -0800 Subject: [PATCH 089/105] [bazel][buildifier] reformat changes in #168434 (#168443) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d021f9da38dbb..85c64ffd58ca6 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3180,7 +3180,7 @@ llvm_target_lib_list = [lib for lib in [ ( ["-gen-sd-node-info"], "lib/Target/SystemZ/SystemZGenSDNodeInfo.inc", - ) + ), ], }, { From 3cba379e3d9bd2f929f5625fe38d17c34f4b7bb7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 21:28:49 +0000 Subject: [PATCH 090/105] [VPlan] Populate and use VPIRMetadata from VPInstructions (NFC) (#167253) Update VPlan to populate VPIRMetadata during VPInstruction construction and use it when creating widened recipes, instead of constructing VPIRMetadata from the underlying IR instruction each time. This centralizes VPIRMetadata in VPInstructions and ensures metadata is consistently available throughout VPlan transformations. PR: https://github.com/llvm/llvm-project/pull/167253 --- .../Vectorize/LoopVectorizationPlanner.h | 41 ++++++------ .../Transforms/Vectorize/LoopVectorize.cpp | 39 ++++++------ .../Transforms/Vectorize/VPRecipeBuilder.h | 10 +-- llvm/lib/Transforms/Vectorize/VPlan.h | 62 +++++++++---------- .../Vectorize/VPlanConstruction.cpp | 37 ++++++++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ---- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 3 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 35 ++++++----- .../Transforms/Vectorize/VPlanTransforms.h | 3 +- .../Transforms/Vectorize/VPlanTest.cpp | 8 +-- 10 files changed, 127 insertions(+), 123 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 5dc3175382254..f533a47150a7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -63,9 +63,11 @@ class VPBuilder { } VPInstruction *createInstruction(unsigned Opcode, - ArrayRef Operands, DebugLoc DL, + ArrayRef Operands, + const VPIRMetadata &MD, DebugLoc DL, const Twine &Name = "") { - return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name)); + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); } public: @@ -150,17 +152,17 @@ class VPBuilder { /// its underlying Instruction. 
VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, + const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - DebugLoc DL = DebugLoc::getUnknown(); - if (Inst) - DL = Inst->getDebugLoc(); - VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); + VPInstruction *NewVPInst = tryInsertInstruction( + new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, DebugLoc DL, const Twine &Name = "") { - return createInstruction(Opcode, Operands, DL, Name); + return createInstruction(Opcode, Operands, {}, DL, Name); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, @@ -174,8 +176,8 @@ class VPBuilder { Type *ResultTy, const VPIRFlags &Flags = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); + return tryInsertInstruction(new VPInstructionWithType( + Opcode, Operands, ResultTy, Flags, {}, DL, Name)); } VPInstruction *createOverflowingOp( @@ -189,13 +191,14 @@ class VPBuilder { VPInstruction *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(VPInstruction::Not, {Operand}, DL, Name); + return createInstruction(VPInstruction::Not, {Operand}, {}, DL, Name); } VPInstruction *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, {}, DL, + Name); } VPInstruction *createOr(VPValue *LHS, VPValue *RHS, @@ -210,20 +213,18 @@ class VPBuilder { VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); + return createNaryOp(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name); } VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "", std::optional FMFs = std::nullopt) { - auto *Select = - FMFs ? 
new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, {}, DL, Name) - : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - DL, Name); - return tryInsertInstruction(Select); + if (!FMFs) + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, + Name); + return tryInsertInstruction(new VPInstruction( + Instruction::Select, {Cond, TrueVal, FalseVal}, *FMFs, {}, DL, Name)); } /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A @@ -306,7 +307,7 @@ class VPBuilder { const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); + new VPInstructionWithType(Opcode, Op, ResultTy, Flags, Metadata, DL)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 10bd6cd471152..356d759b94799 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7616,14 +7616,13 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, } if (VPI->getOpcode() == Instruction::Load) { auto *Load = cast(I); - return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - VPIRMetadata(*Load, LVer), I->getDebugLoc()); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI, + VPI->getDebugLoc()); } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, - Consecutive, Reverse, - VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); + Consecutive, Reverse, *VPI, VPI->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will @@ -7751,7 +7750,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), + return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, VPI->getDebugLoc()); Function *Variant = nullptr; @@ -7843,7 +7842,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; - return new VPWidenRecipe(*I, Ops); + return new VPWidenRecipe(*I, Ops, *VPI, VPI->getDebugLoc()); } [[fallthrough]]; } @@ -7889,7 +7888,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { // For other binops, the legacy cost model only checks the second operand. 
NewOps[1] = GetConstantViaSCEV(NewOps[1]); } - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } case Instruction::ExtractValue: { SmallVector NewOps(VPI->operands()); @@ -7897,7 +7896,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; NewOps.push_back(Plan.getConstantInt(32, Idx)); - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } }; } @@ -7981,8 +7980,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, - BlockInMask, VPIRMetadata(*I, LVer)); + auto *Recipe = + new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI); return Recipe; } @@ -8235,13 +8234,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return new VPWidenGEPRecipe(cast(Instr), R->operands()); if (VPI->getOpcode() == Instruction::Select) - return new VPWidenSelectRecipe(*cast(Instr), R->operands()); + return new VPWidenSelectRecipe(*cast(Instr), R->operands(), + *VPI); if (Instruction::isCast(VPI->getOpcode())) { auto *CastR = cast(R); auto *CI = cast(Instr); return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), - CastR->getResultType(), *CI); + CastR->getResultType(), *CI, *VPI); } return tryToWiden(VPI); @@ -8269,7 +8269,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, SmallVector Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*ReductionI, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRMetadata(), + ReductionI->getDebugLoc()); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } @@ -8302,7 +8303,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8408,7 +8409,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // VPInstructions in the loop. // --------------------------------------------------------------------------- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, LVer); + Builder, BlockMaskCache); // TODO: Handle partial reductions with EVL tail folding. if (!CM.foldTailWithEVL()) RecipeBuilder.collectScaledReductions(Range); @@ -8453,9 +8454,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. 
if (Legal->isInvariantStoreOfReduction(SI)) { - auto *Recipe = - new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */, - nullptr /*Mask*/, VPIRMetadata(*SI, LVer)); + auto *Recipe = new VPReplicateRecipe( + SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, + *cast(SingleDef)); Recipe->insertBefore(*MiddleVPBB, MBIP); } R.eraseFromParent(); @@ -8606,7 +8607,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // addScalarResumePhis. DenseMap BlockMaskCache; VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, nullptr /*LVer*/); + Builder, BlockMaskCache); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index a7000aff06379..87280b83fc0e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -84,10 +84,6 @@ class VPRecipeBuilder { /// A mapping of partial reduction exit instructions to their scaling factor. DenseMap ScaledReductionMap; - /// Loop versioning instance for getting noalias metadata guaranteed by - /// runtime checks. - LoopVersioning *LVer; - /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -144,11 +140,9 @@ class VPRecipeBuilder { LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder, - DenseMap &BlockMaskCache, - LoopVersioning *LVer) + DenseMap &BlockMaskCache) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache), - LVer(LVer) {} + CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 67fa294d095bd..c81834e401726 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -65,7 +65,6 @@ class VPReplicateRecipe; class VPlanSlp; class Value; class LoopVectorizationCostModel; -class LoopVersioning; struct VPCostContext; @@ -958,10 +957,6 @@ class VPIRMetadata { /// \p I. VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } - /// Adds metatadata that can be preserved from the original instruction - /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. - VPIRMetadata(Instruction &I, LoopVersioning *LVer); - /// Copy constructor for cloning. 
VPIRMetadata(const VPIRMetadata &Other) = default; @@ -1120,11 +1115,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, public: VPInstruction(unsigned Opcode, ArrayRef Operands, - DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, {}, {}, DL, Name) {} - - VPInstruction(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1214,14 +1205,10 @@ class VPInstructionWithType : public VPInstruction { public: VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, + Type *ResultTy, const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), - ResultTy(ResultTy) {} - - VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, - const VPIRMetadata &Metadata, const Twine &Name = "") : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), ResultTy(ResultTy) {} @@ -1250,7 +1237,7 @@ class VPInstructionWithType : public VPInstruction { VPInstruction *clone() override { auto *New = new VPInstructionWithType(getOpcode(), operands(), getResultType(), - *this, getDebugLoc(), getName()); + *this, *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1334,7 +1321,7 @@ class VPPhiAccessors { struct LLVM_ABI_FOR_TEST VPPhi : public VPInstruction, public VPPhiAccessors { VPPhi(ArrayRef Operands, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Instruction::PHI, Operands, DL, Name) {} + : VPInstruction(Instruction::PHI, Operands, {}, {}, DL, Name) {} static inline bool classof(const VPUser *U) { auto *VPI = dyn_cast(U); @@ -1478,9 +1465,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), VPIRMetadata(Metadata), Opcode(Opcode) {} - VPWidenRecipe(Instruction &I, ArrayRef Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), - Opcode(I.getOpcode()) {} + VPWidenRecipe(Instruction &I, ArrayRef Operands, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), + VPIRMetadata(Metadata), Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1521,13 +1509,12 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst &UI) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPIRMetadata(UI), - Opcode(Opcode), ResultTy(ResultTy) { + CastInst &UI, const VPIRMetadata &Metadata) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), + VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}, @@ -1590,18 +1577,23 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRMetadata &MD = {}, DebugLoc DL = 
DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), - VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), + VPIRMetadata(MD), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), MayWriteToMemory(CI.mayWriteToMemory()), MayHaveSideEffects(CI.mayHaveSideEffects()) {} VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), - VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags, + DL), + VPIRMetadata(Metadata), VectorIntrinsicID(VectorIntrinsicID), + ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); AttributeSet Attrs = Intrinsic::getFnAttributes(Ctx, VectorIntrinsicID); MemoryEffects ME = Attrs.getMemoryEffects(); @@ -1617,9 +1609,10 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenIntrinsicRecipe *clone() override { if (Value *CI = getUnderlyingValue()) return new VPWidenIntrinsicRecipe(*cast(CI), VectorIntrinsicID, - operands(), ResultTy, getDebugLoc()); + operands(), ResultTy, *this, + getDebugLoc()); return new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(), ResultTy, - getDebugLoc()); + *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC) @@ -1760,15 +1753,16 @@ class VPHistogramRecipe : public VPRecipeBase { /// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { - VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands) + VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands, + const VPIRMetadata &MD = {}) : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I), - VPIRMetadata(I) {} + VPIRMetadata(MD) {} ~VPWidenSelectRecipe() override = default; VPWidenSelectRecipe *clone() override { return new VPWidenSelectRecipe(*cast(getUnderlyingInstr()), - operands()); + operands(), *this); } VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index aed85271350c8..612202d049774 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #define DEBUG_TYPE "vplan" @@ -37,6 +38,9 @@ class PlainCFGBuilder { // Loop Info analysis. LoopInfo *LI; + // Loop versioning for alias metadata. + LoopVersioning *LVer; + // Vectorization plan that we are working on. std::unique_ptr Plan; @@ -65,8 +69,8 @@ class PlainCFGBuilder { void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); public: - PlainCFGBuilder(Loop *Lp, LoopInfo *LI) - : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} + PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer) + : TheLoop(Lp), LI(LI), LVer(LVer), Plan(std::make_unique(Lp)) {} /// Build plain CFG for TheLoop and connect it to Plan's entry. std::unique_ptr buildPlainCFG(); @@ -186,7 +190,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // recipes. 
if (Br->isConditional()) { VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); - VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst); + VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); } // Skip the rest of the Instruction processing for Branch instructions. @@ -200,7 +205,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, SmallVector Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); - VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst); + VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); continue; } @@ -228,6 +234,18 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } else { + // Build VPIRMetadata from the instruction and add loop versioning + // metadata for loads and stores. + VPIRMetadata MD(*Inst); + if (isa(Inst) && LVer) { + const auto &[AliasScopeMD, NoAliasMD] = + LVer->getNoAliasMetadataFor(Inst); + if (AliasScopeMD) + MD.setMetadata(LLVMContext::MD_alias_scope, AliasScopeMD); + if (NoAliasMD) + MD.setMetadata(LLVMContext::MD_noalias, NoAliasMD); + } + // Translate LLVM-IR operands into VPValue operands and set them in the // new VPInstruction. SmallVector VPOperands; @@ -236,12 +254,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, if (auto *CI = dyn_cast(Inst)) { NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], - CI->getType(), CI->getDebugLoc()); + CI->getType(), CI->getDebugLoc(), + {}, MD); NewR->setUnderlyingValue(CI); } else { // Build VPInstruction for any arbitrary Instruction without specific // representation in VPlan. 
- NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst); + NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, MD, + Inst->getDebugLoc()); } } @@ -537,8 +557,9 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, std::unique_ptr VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, - DebugLoc IVDL, PredicatedScalarEvolution &PSE) { - PlainCFGBuilder Builder(TheLoop, &LI); + DebugLoc IVDL, PredicatedScalarEvolution &PSE, + LoopVersioning *LVer) { + PlainCFGBuilder Builder(TheLoop, &LI, LVer); std::unique_ptr VPlan0 = Builder.buildPlainCFG(); addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); return VPlan0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e2a8e495d5ed5..fca6554ad77c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -36,7 +36,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" #include using namespace llvm; @@ -1674,17 +1673,6 @@ void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, } #endif -VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) - : VPIRMetadata(I) { - if (!LVer || !isa(&I)) - return; - const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); - if (AliasScopeMD) - Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); - if (NoAliasMD) - Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); -} - void VPIRMetadata::applyMetadata(Instruction &I) const { for (const auto &[Kind, Node] : Metadata) I.setMetadata(Kind, Node); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 1453c6623625b..3b5cc9fcb9820 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -517,7 +517,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { assert(CombinedOperands.size() > 0 && "Need more some operands"); auto *Inst = cast(Values[0])->getUnderlyingInstr(); - auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + auto *VPI = + new VPInstruction(Opcode, CombinedOperands, {}, {}, Inst->getDebugLoc()); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0] << "\n"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3e2c47e4556a6..89118b49bed44 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -85,20 +85,19 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( Ingredient.getDebugLoc()); } } else { - assert(isa(&Ingredient) && - "only VPInstructions expected here"); + auto *VPI = cast(&Ingredient); assert(!isa(Inst) && "phis should be handled above"); // Create VPWidenMemoryRecipe for loads and stores. 
if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + false /*Consecutive*/, false /*Reverse*/, *VPI, Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - VPIRMetadata(*Store), Ingredient.getDebugLoc()); + nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI, + Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -107,15 +106,17 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return false; NewRecipe = new VPWidenIntrinsicRecipe( *CI, getVectorIntrinsicIDForCall(CI, &TLI), - drop_end(Ingredient.operands()), CI->getType(), + drop_end(Ingredient.operands()), CI->getType(), *VPI, CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast(Inst)) { - NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); + NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands(), *VPI); } else if (auto *CI = dyn_cast(Inst)) { - NewRecipe = new VPWidenCastRecipe( - CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); + NewRecipe = + new VPWidenCastRecipe(CI->getOpcode(), Ingredient.getOperand(0), + CI->getType(), *CI, *VPI); } else { - NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); + NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI, + Ingredient.getDebugLoc()); } } @@ -1705,8 +1706,9 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, Ops.append({ALM, Plan.getOrAddLiveIn( ConstantInt::get(IntegerType::getInt64Ty(Ctx), VF.getKnownMinValue() * Part))}); - auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, - IntegerType::getInt1Ty(Ctx), DL); + auto *Ext = + new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), {}, {}, DL); Extracts[Part] = Ext; Ext->insertAfter(ALM); } @@ -1845,7 +1847,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // The vector region contains header phis for which we cannot remove the // loop region yet. 
     auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
-                                  Term->getDebugLoc());
+                                  {}, {}, Term->getDebugLoc());
     ExitingVPBB->appendRecipe(BOC);
   }
 
@@ -2679,13 +2681,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
             m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
     return new VPWidenIntrinsicRecipe(
         Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
-        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
+        TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
 
   if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
                                  m_VPValue(RHS))))
     return new VPWidenIntrinsicRecipe(
         Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
-        TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc());
+        TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
 
   return nullptr;
 }
@@ -2753,7 +2755,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
       VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe(
           Intrinsic::experimental_vp_splice,
           {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
-          TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc());
+          TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
+          R.getDebugLoc());
       VPSplice->insertBefore(&R);
       R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
       ToErase.push_back(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e3bde8a47dcbc..a44a4f69c917b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -23,6 +23,7 @@ namespace llvm {
 class InductionDescriptor;
 class Instruction;
+class LoopVersioning;
 class PHINode;
 class ScalarEvolution;
 class PredicatedScalarEvolution;
@@ -99,7 +100,7 @@ struct VPlanTransforms {
   /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
   LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
   buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
-              PredicatedScalarEvolution &PSE);
+              PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr);
 
   /// Update \p Plan to account for all early exits.
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ee7fa175ca918..0e76c64f09f59 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1009,7 +1009,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe WidenR(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc()); checkVPRecipeCastImpl(&WidenR); delete AI; @@ -1092,7 +1092,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { IntegerType *Int64 = IntegerType::get(C, 64); auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast); + VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); checkVPRecipeCastImpl(&Recipe); delete Cast; @@ -1263,7 +1263,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe Recipe(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe Recipe(*AI, Args, VPIRMetadata(), DebugLoc()); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1468,7 +1468,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) { Args.push_back(ExtVPV1); Args.push_back(ExtVPV2); VPWidenRecipe *WidenR = - new VPWidenRecipe(*AI, make_range(Args.begin(), Args.end())); + new VPWidenRecipe(*AI, Args, VPIRMetadata(), DebugLoc()); VPBB1->appendRecipe(WidenR); { From 92c8c87c49100e3f14e3ec46abf47f27191f8b53 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Mon, 17 Nov 2025 13:42:12 -0800 Subject: [PATCH 091/105] [libc] Implement wcstod and wcstold. (#168020) These are simply implemented as specializations of strtofloatingpoint for double / long double and for wchar_t. The unit tests are copied from the strtod / strtold ones. 
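
As a caller-facing sketch of the semantics these entrypoints provide (standard
C behavior; the example program below is illustrative and not part of this
patch):

  #include <errno.h>
  #include <stdio.h>
  #include <wchar.h>

  int main(void) {
    const wchar_t *input = L"1.5e300suffix";
    wchar_t *end = NULL;
    errno = 0;
    double d = wcstod(input, &end); // parses "1.5e300"; end points at "suffix"
    if (errno == ERANGE)
      fputs("value out of range\n", stderr);
    printf("value: %g, consumed %td wide characters\n", d, end - input);
    return 0;
  }

wcstold has the same shape, returning a long double parsed via
strtofloatingpoint<long double>.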
---
 libc/config/linux/x86_64/entrypoints.txt |   2 +
 libc/include/wchar.yaml                  |  14 +
 libc/src/wchar/CMakeLists.txt            |  22 +
 libc/src/wchar/wcstod.cpp                |  30 ++
 libc/src/wchar/wcstod.h                  |  20 +
 libc/src/wchar/wcstold.cpp               |  30 ++
 libc/src/wchar/wcstold.h                 |  21 +
 libc/test/src/wchar/CMakeLists.txt       |  29 +-
 libc/test/src/wchar/wcstod_test.cpp      | 586 +++++++++++++++++++++++
 libc/test/src/wchar/wcstold_test.cpp     | 262 ++++++++++
 10 files changed, 1015 insertions(+), 1 deletion(-)
 create mode 100644 libc/src/wchar/wcstod.cpp
 create mode 100644 libc/src/wchar/wcstod.h
 create mode 100644 libc/src/wchar/wcstold.cpp
 create mode 100644 libc/src/wchar/wcstold.h
 create mode 100644 libc/test/src/wchar/wcstod_test.cpp
 create mode 100644 libc/test/src/wchar/wcstold_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index d3bcad470b3e1..5036c9438a503 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -398,9 +398,11 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wmemchr
     libc.src.wchar.wcpcpy
     libc.src.wchar.wcpncpy
+    libc.src.wchar.wcstod
     libc.src.wchar.wcstof
     libc.src.wchar.wcstok
     libc.src.wchar.wcstol
+    libc.src.wchar.wcstold
     libc.src.wchar.wcstoll
     libc.src.wchar.wcstoul
     libc.src.wchar.wcstoull
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index faceb9bb4e12d..a524c7f56bed0 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -367,3 +367,17 @@ functions:
     arguments:
       - type: const wchar_t *__restrict
       - type: wchar_t **__restrict
+  - name: wcstod
+    standards:
+      - stdc
+    return_type: double
+    arguments:
+      - type: const wchar_t *__restrict
+      - type: wchar_t **__restrict
+  - name: wcstold
+    standards:
+      - stdc
+    return_type: long double
+    arguments:
+      - type: const wchar_t *__restrict
+      - type: wchar_t **__restrict
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index e3fac9fb80529..e6d9af9eacf73 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -110,6 +110,28 @@ add_entrypoint_object(
     libc.src.errno.errno
 )
 
+add_entrypoint_object(
+  wcstod
+  SRCS
+    wcstod.cpp
+  HDRS
+    wcstod.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+)
+
+add_entrypoint_object(
+  wcstold
+  SRCS
+    wcstold.cpp
+  HDRS
+    wcstold.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+)
+
 add_entrypoint_object(
   wcstok
   SRCS
diff --git a/libc/src/wchar/wcstod.cpp b/libc/src/wchar/wcstod.cpp
new file mode 100644
index 0000000000000..95351c304c0ff
--- /dev/null
+++ b/libc/src/wchar/wcstod.cpp
@@ -0,0 +1,30 @@
+//===-- Implementation of wcstod ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstod.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/str_to_float.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, wcstod,
+                   (const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end)) {
+  auto result = internal::strtofloatingpoint<double>(str);
+  if (result.has_error())
+    libc_errno = result.error;
+
+  if (str_end != nullptr)
+    *str_end = const_cast<wchar_t *>(str + result.parsed_len);
+
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wcstod.h b/libc/src/wchar/wcstod.h
new file mode 100644
index 0000000000000..ff397b93d405d
--- /dev/null
+++ b/libc/src/wchar/wcstod.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for wcstod ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOD_H
+#define LLVM_LIBC_SRC_WCHAR_WCSTOD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double wcstod(const wchar_t *__restrict str, wchar_t **__restrict str_end);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WCSTOD_H
diff --git a/libc/src/wchar/wcstold.cpp b/libc/src/wchar/wcstold.cpp
new file mode 100644
index 0000000000000..ffbc3f248b883
--- /dev/null
+++ b/libc/src/wchar/wcstold.cpp
@@ -0,0 +1,30 @@
+//===-- Implementation of wcstold -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstold.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/str_to_float.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(long double, wcstold,
+                   (const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end)) {
+  auto result = internal::strtofloatingpoint<long double>(str);
+  if (result.has_error())
+    libc_errno = result.error;
+
+  if (str_end != nullptr)
+    *str_end = const_cast<wchar_t *>(str + result.parsed_len);
+
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wcstold.h b/libc/src/wchar/wcstold.h
new file mode 100644
index 0000000000000..1525362b33571
--- /dev/null
+++ b/libc/src/wchar/wcstold.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for wcstold -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
+#define LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+long double wcstold(const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 122cad2575327..a62a30fe00124 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -538,5 +538,32 @@ add_libc_test(
   DEPENDS
     libc.src.wchar.wcstof
     libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.LibcFPTestHelpers
+  LINK_LIBRARIES
+    LibcFPTestHelpers
+)
+
+add_libc_test(
+  wcstod_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wcstod_test.cpp
+  DEPENDS
+    libc.src.wchar.wcstod
+    libc.test.UnitTest.ErrnoCheckingTest
+  LINK_LIBRARIES
+    LibcFPTestHelpers
+)
+
+add_libc_test(
+  wcstold_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wcstold_test.cpp
+  DEPENDS
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.uint128
+    libc.src.wchar.wcstold
+    libc.test.UnitTest.ErrnoCheckingTest
 )
diff --git a/libc/test/src/wchar/wcstod_test.cpp b/libc/test/src/wchar/wcstod_test.cpp
new file mode 100644
index 0000000000000..0c2b82cfba898
--- /dev/null
+++ b/libc/test/src/wchar/wcstod_test.cpp
@@ -0,0 +1,586 @@
+//===-- Unittests for wcstod ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstod.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/RoundingModeUtils.h"
+#include "test/UnitTest/Test.h"
+
+#include <stddef.h>
+
+using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest;
+using LIBC_NAMESPACE::fputil::testing::RoundingMode;
+
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+
+class LlvmLibcWcstodTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
+                           ForceRoundingModeTest<RoundingMode::Nearest> {
+public:
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const uint64_t expectedRawData, const int expectedErrno = 0) {
+    // expectedRawData is the expected double result as a uint64_t, organized
+    // according to IEEE754:
+    //
+    // +-- 1 Sign Bit                        +-- 52 Mantissa bits
+    // |                                     |
+    // |           +-------------------------+------------------------+
+    // |           |                                                  |
+    // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+    //  |         |
+    //  +----+----+
+    //       |
+    //       +-- 11 Exponent Bits
+    //
+    // This is so that the result can be compared in parts.
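+    //
+    // Worked example (illustrative addition, not from the original comment):
+    // for run_test(L"123", 3, 0x405ec00000000000), the raw data splits into
+    // sign bit 0, exponent 0x405 (1029, i.e. 2^(1029 - 1023) = 2^6), and
+    // mantissa 0xec00000000000 (1.921875), giving 1.921875 * 2^6 = 123.0.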
+ wchar_t *str_end = nullptr; + + LIBC_NAMESPACE::fputil::FPBits expected_fp = + LIBC_NAMESPACE::fputil::FPBits(expectedRawData); + + double result = LIBC_NAMESPACE::wcstod(inputString, &str_end); + if (expectedErrno == 0) + EXPECT_THAT(result, Succeeds(expected_fp.get_val())); + else + EXPECT_THAT(result, Fails(expectedErrno, expected_fp.get_val())); + EXPECT_EQ(str_end - inputString, expectedStrLen); + } +}; + +TEST_F(LlvmLibcWcstodTest, SimpleTest) { + run_test(L"123", 3, uint64_t(0x405ec00000000000)); + + // This should fail on Eisel-Lemire, forcing a fallback to simple decimal + // conversion. + run_test(L"12345678901234549760", 20, uint64_t(0x43e56a95319d63d8)); + + // Found while looking for difficult test cases here: + // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt + run_test(L"1090544144181609348835077142190", 31, + uint64_t(0x462b8779f2474dfb)); + + run_test(L"0x123", 5, uint64_t(0x4072300000000000)); +} + +// These are tests that have caused problems in the past. +TEST_F(LlvmLibcWcstodTest, SpecificFailures) { + run_test(L"3E70000000000000", 16, uint64_t(0x7FF0000000000000), ERANGE); + run_test(L"358416272e-33", 13, uint64_t(0x3adbbb2a68c9d0b9)); + run_test(L"2.16656806400000023841857910156251e9", 36, + uint64_t(0x41e0246690000001)); + run_test(L"27949676547093071875", 20, uint64_t(0x43f83e132bc608c9)); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000e-800", + 806, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000e-799", + 806, 0x4024000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000e-800", + 807, 0x4024000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000e-64", + 69, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000e-128", + 134, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000e-256", + 262, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000e-512", + 518, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000e-1024", + 1031, 0x3ff0000000000000); + run_test( + L"0" + "100000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000000000e-1024", + 1032, 0x3ff0000000000000); +} + +TEST_F(LlvmLibcWcstodTest, FuzzFailures) { + run_test(L"-\xff\xff\xff\xff\xff\xff\xff\x01", 0, uint64_t(0)); + run_test(L"-.????", 0, uint64_t(0)); + run_test( + L"44444444444444444444444444444444444444444444444444A44444444444444444" + "44444444444*\x99\xff\xff\xff\xff", + 50, uint64_t(0x4a3e68fdd0e0b2d8)); + run_test(L"-NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNKNNNNNNNNNNNNNNNNNN?" + "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN?", + 0, uint64_t(0)); + run_test(L"0x.666E40", 9, uint64_t(0x3fd99b9000000000)); + + // glibc version 2.36 and higher (not tested with lower versions) disagrees + // with this result, but ours is correct for the nearest rounding mode. See + // this bug: https://sourceware.org/bugzilla/show_bug.cgi?id=30220 + run_test(L"0x30000002222225p-1077", 22, uint64_t(0x0006000000444445), ERANGE); + + // This value triggered a bug by having an exponent exactly equal to the + // maximum. The overflow checks would accept a value less than the max value + // as valid and greater than the max value as invalid (and set it to the max), + // but an exponent of exactly max value hit the else condition which is + // intended for underflow and set the exponent to the min exponent. + run_test( + L"18477446000000000000000000000000000005230000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000935166201543003765631683711878842" + "388777446000000000000430037600000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000005238581124701719460000000" + "000000000017194600000000000000000070046000000000000000000000000100000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000002000000000000000" + "000000000000056316837118788423887774460000000000000000000000000000052385" + "811247017194600000000000000000171946000000000000000000700460000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000002000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000523858112470171946000000" + "000000000001719460000000000000000007004600000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0200000000000000000E608", + 1462, uint64_t(0x7ff0000000000000), ERANGE); + + // Same as above but for hex. 
+ run_test(L"0x0164810157p2047", 17, uint64_t(0x7ff0000000000000), ERANGE); + + // This test ensures that only the correct number of characters is accepted. + // An exponent symbol followed by a sign isn't a valid exponent. + run_test(L"2e+", 1, uint64_t(0x4000000000000000)); + run_test(L"0x2p+", 3, uint64_t(0x4000000000000000)); + + // This bug was in the handling of very large exponents in the exponent + // marker. Previously anything greater than 10,000 would be set to 10,000. + // This caused incorrect behavior if there were more than 10,000 '0's in the + // input number, and then a correspondingly large exponent. This test case has + // 24,744 zeroes. + run_test( + L"0x." + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000fp551615", + 24755, uint64_t(0x7ff0000000000000), ERANGE); +} diff --git a/libc/test/src/wchar/wcstold_test.cpp b/libc/test/src/wchar/wcstold_test.cpp new file mode 100644 index 0000000000000..3a7fdfce3e732 --- /dev/null +++ b/libc/test/src/wchar/wcstold_test.cpp @@ -0,0 +1,262 @@ +//===-- Unittests for wcstold ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/uint128.h"
+#include "src/wchar/wcstold.h"
+
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+#include <stddef.h>
+
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+#define SELECT_CONST(val, _, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80)
+#define SELECT_CONST(_, val, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT128)
+#define SELECT_CONST(_, __, val) val
+#else
+#error "Unknown long double type"
+#endif
+
+class LlvmLibcWcstoldTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+public:
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const uint64_t expectedRawData, const int expectedErrno = 0)
+#else
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const UInt128 expectedRawData, const int expectedErrno = 0)
+#endif
+  {
+    // expectedRawData64 is the expected long double result as a uint64_t,
+    // organized according to the IEEE754 double precision format:
+    //
+    // +-- 1 Sign Bit                        +-- 52 Mantissa bits
+    // |                                     |
+    // |           +-------------------------+------------------------+
+    // |           |                                                  |
+    // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+    //  |         |
+    //  +----+----+
+    //       |
+    //       +-- 11 Exponent Bits
+
+    // expectedRawData80 is the expected long double result as a UInt128,
+    // organized according to the x86 extended precision format:
+    //
+    // +-- 1 Sign Bit
+    // |
+    // |               +-- 1 Integer part bit (1 unless this is a subnormal)
+    // |               |
+    // SEEEEEEEEEEEEEEEIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             |  |                                                      |
+    //  +------+------+  +---------------------------+--------------------------+
+    //         |                                     |
+    //         +-- 15 Exponent Bits                  +-- 63 Mantissa bits
+
+    // expectedRawData128 is the expected long double result as a UInt128,
+    // organized according to IEEE754 quadruple precision format:
+    //
+    // +-- 1 Sign Bit                +-- 112 Mantissa bits
+    // |                             |
+    // |              +--------------------------+--------------------------+
+    // |              |                                                     |
+    // SEEEEEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             |
+    //  +------+------+
+    //         |
+    //         +-- 15 Exponent Bits
+    wchar_t *str_end = nullptr;
+
+    using FPBits = LIBC_NAMESPACE::fputil::FPBits<long double>;
+    FPBits expected_fp =
+        FPBits(static_cast<FPBits::StorageType>(expectedRawData));
+    const int expected_errno = expectedErrno;
+
+    long double result = LIBC_NAMESPACE::wcstold(inputString, &str_end);
+
+    LIBC_NAMESPACE::fputil::FPBits<long double> actual_fp =
+        LIBC_NAMESPACE::fputil::FPBits<long double>();
+    actual_fp = LIBC_NAMESPACE::fputil::FPBits<long double>(result);
+
+    EXPECT_EQ(str_end - inputString, expectedStrLen);
+
+    EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+    EXPECT_EQ(actual_fp.is_neg(), expected_fp.is_neg());
+    EXPECT_EQ(actual_fp.get_exponent(), expected_fp.get_exponent());
+    EXPECT_EQ(actual_fp.get_mantissa(), expected_fp.get_mantissa());
+    ASSERT_ERRNO_EQ(expected_errno);
+  }
+};
+
+TEST_F(LlvmLibcWcstoldTest, SimpleTest) {
+  run_test(L"123", 3,
+           SELECT_CONST(uint64_t(0x405ec00000000000),
+                        UInt128(0x4005f60000) << 40,
+                        UInt128(0x4005ec0000000000) << 64));
+
+  // This should fail on Eisel-Lemire, forcing a fallback to simple decimal
+  // conversion.
+ run_test(L"12345678901234549760", 20, + SELECT_CONST(uint64_t(0x43e56a95319d63d8), + (UInt128(0x403eab54a9) << 40) + UInt128(0x8ceb1ec400), + (UInt128(0x403e56a95319d63d) << 64) + + UInt128(0x8800000000000000))); + + // Found while looking for difficult test cases here: + // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt + run_test(L"1090544144181609348835077142190", 31, + SELECT_CONST(uint64_t(0x462b8779f2474dfb), + (UInt128(0x4062dc3bcf) << 40) + UInt128(0x923a6fd402), + (UInt128(0x4062b8779f2474df) << 64) + + UInt128(0xa804bfd8c6d5c000))); + + run_test(L"0x123", 5, + SELECT_CONST(uint64_t(0x4072300000000000), + (UInt128(0x4007918000) << 40), + (UInt128(0x4007230000000000) << 64))); +} + +// These are tests that have caused problems for doubles in the past. +TEST_F(LlvmLibcWcstoldTest, Float64SpecificFailures) { + run_test(L"3E70000000000000", 16, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64)), + ERANGE); + run_test(L"358416272e-33", 13, + SELECT_CONST(uint64_t(0x3adbbb2a68c9d0b9), + (UInt128(0x3fadddd953) << 40) + UInt128(0x464e85c400), + (UInt128(0x3fadbbb2a68c9d0b) << 64) + + UInt128(0x8800e7969e1c5fc8))); + run_test(L"2.16656806400000023841857910156251e9", 36, + SELECT_CONST(uint64_t(0x41e0246690000001), + (UInt128(0x401e812334) << 40) + UInt128(0x8000000400), + (UInt128(0x401e024669000000) << 64) + + UInt128(0x800000000000018))); + run_test(L"27949676547093071875", 20, + SELECT_CONST(uint64_t(0x43f83e132bc608c9), + (UInt128(0x403fc1f099) << 40) + UInt128(0x5e30464402), + (UInt128(0x403f83e132bc608c) << 64) + + UInt128(0x8803000000000000))); +} + +TEST_F(LlvmLibcWcstoldTest, Float80SpecificFailures) { + run_test(L"777777777777777777777777777777777777777777777777777777777777777777" + "7777777777777777777777777777777777", + 100, + SELECT_CONST(uint64_t(0x54ac729b8fcaf734), + (UInt128(0x414ae394dc) << 40) + UInt128(0x7e57b9a0c2), + (UInt128(0x414ac729b8fcaf73) << 64) + + UInt128(0x4184a3d793224129))); +} + +TEST_F(LlvmLibcWcstoldTest, MaxSizeNumbers) { + run_test(L"1.1897314953572317650e4932", 26, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7ffeffffff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xfffd57322e3f8675)), + SELECT_CONST(ERANGE, 0, 0)); + run_test(L"1.18973149535723176508e4932", 27, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xffffd2478338036c)), + SELECT_CONST(ERANGE, ERANGE, 0)); +} + +// These tests check subnormal behavior for 80 bit and 128 bit floats. They will +// be too small for 64 bit floats. 
+TEST_F(LlvmLibcWcstoldTest, SubnormalTests) { + run_test(L"1e-4950", 7, + SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000003)), + (UInt128(0x000000000000000000057c9647e1a018))), + ERANGE); + run_test(L"1.89e-4951", 10, + SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000001)), + (UInt128(0x0000000000000000000109778a006738))), + ERANGE); + run_test(L"4e-4966", 7, + SELECT_CONST(uint64_t(0), (UInt128(0)), + (UInt128(0x00000000000000000000000000000001))), + ERANGE); +} + +TEST_F(LlvmLibcWcstoldTest, SmallNormalTests) { + run_test(L"3.37e-4932", 10, + SELECT_CONST( + uint64_t(0), (UInt128(0x1804cf7) << 40) + UInt128(0x908850712), + (UInt128(0x10099ee12110a) << 64) + UInt128(0xe24b75c0f50dc0c)), + SELECT_CONST(ERANGE, 0, 0)); +} + +TEST_F(LlvmLibcWcstoldTest, ComplexHexadecimalTests) { + run_test(L"0x1p16383", 9, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7ffe800000) << 40), + (UInt128(0x7ffe000000000000) << 64)), + SELECT_CONST(ERANGE, 0, 0)); + run_test(L"0x123456789abcdef", 17, + SELECT_CONST(0x43723456789abcdf, + (UInt128(0x403791a2b3) << 40) + UInt128(0xc4d5e6f780), + (UInt128(0x403723456789abcd) << 64) + + UInt128(0xef00000000000000))); + run_test(L"0x123456789abcdef0123456789ABCDEF", 33, + SELECT_CONST(0x47723456789abcdf, + (UInt128(0x407791a2b3) << 40) + UInt128(0xc4d5e6f781), + (UInt128(0x407723456789abcd) << 64) + + UInt128(0xef0123456789abce))); +} + +TEST_F(LlvmLibcWcstoldTest, InfTests) { + run_test(L"INF", 3, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64))); + run_test(L"INFinity", 8, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64))); + run_test(L"-inf", 4, + SELECT_CONST(0xfff0000000000000, (UInt128(0xffff800000) << 40), + (UInt128(0xffff000000000000) << 64))); +} + +TEST_F(LlvmLibcWcstoldTest, NaNTests) { + run_test(L"NaN", 3, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); + run_test(L"-nAn", 4, + SELECT_CONST(0xfff8000000000000, (UInt128(0xffffc00000) << 40), + (UInt128(0xffff800000000000) << 64))); + run_test(L"NaN()", 5, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); + run_test(L"NaN(1234)", 9, + SELECT_CONST(0x7ff80000000004d2, + (UInt128(0x7fffc00000) << 40) + UInt128(0x4d2), + (UInt128(0x7fff800000000000) << 64) + UInt128(0x4d2))); + run_test(L"NaN(0xffffffffffff)", 19, + SELECT_CONST(0x7ff8ffffffffffff, + (UInt128(0x7fffc000ff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xffffffffffff))); + run_test(L"NaN(0xfffffffffffff)", 20, + SELECT_CONST(0x7fffffffffffffff, + (UInt128(0x7fffc00fff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xfffffffffffff))); + run_test(L"NaN(0xffffffffffffffff)", 23, + SELECT_CONST(0x7fffffffffffffff, + (UInt128(0x7fffffffff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xffffffffffffffff))); + run_test(L"NaN( 1234)", 3, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); +} From 3f60d220514c4be00e548a17a85c2fa8fa89cc35 Mon Sep 17 00:00:00 2001 From: Ziqing Luo Date: Mon, 17 Nov 2025 13:43:53 -0800 Subject: [PATCH 092/105] [-Wunsafe-buffer-usage] Fold the expression "cond ? 
E1 : E2" when checking safe patterns, if "cond" is a constant (#167989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `-Wunsafe-buffer-usage`, many safe pattern checks can benefit from constant folding. This commit improves null-terminated pointer checks by folding conditional expressions. rdar://159374822 --------- Co-authored-by: Balázs Benics --- clang/lib/Analysis/UnsafeBufferUsage.cpp | 26 +++++++++++++--- ...n-unsafe-buffer-usage-fold-conditional.cpp | 31 +++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index f5a368636c43d..da155d31d4a88 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -781,9 +781,25 @@ struct LibcFunNamePrefixSuffixParser { } }; +// Constant fold a conditional expression 'cond ? A : B' to +// - 'A', if 'cond' has constant true value; +// - 'B', if 'cond' has constant false value. +static const Expr *tryConstantFoldConditionalExpr(const Expr *E, + const ASTContext &Ctx) { + // FIXME: more places can use this function + if (const auto *CE = dyn_cast(E)) { + bool CondEval; + + if (CE->getCond()->EvaluateAsBooleanCondition(CondEval, Ctx)) + return CondEval ? CE->getLHS() : CE->getRHS(); + } + return E; +} + // A pointer type expression is known to be null-terminated, if it has the // form: E.c_str(), for any expression E of `std::string` type. -static bool isNullTermPointer(const Expr *Ptr) { +static bool isNullTermPointer(const Expr *Ptr, ASTContext &Ctx) { + Ptr = tryConstantFoldConditionalExpr(Ptr, Ctx); if (isa(Ptr->IgnoreParenImpCasts())) return true; if (isa(Ptr->IgnoreParenImpCasts())) @@ -874,7 +890,7 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, const Expr *Arg = Call->getArg(ArgIdx); - if (isNullTermPointer(Arg)) + if (isNullTermPointer(Arg, Ctx)) // If Arg is a null-terminated pointer, it is safe anyway. return true; // continue parsing @@ -922,8 +938,8 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, // (including the format argument) is unsafe pointer. 
   return llvm::any_of(
       llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()),
-      [&UnsafeArg](const Expr *Arg) -> bool {
-        if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+      [&UnsafeArg, &Ctx](const Expr *Arg) -> bool {
+        if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
           UnsafeArg = Arg;
           return true;
         }
@@ -1175,7 +1191,7 @@ static bool hasUnsafePrintfStringArg(const CallExpr &Node, ASTContext &Ctx,
     // We don't really recognize this "normal" printf, the only thing we
     // can do is to require all pointers to be null-terminated:
     for (const auto *Arg : Node.arguments())
-      if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+      if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
        Result.addNode(Tag, DynTypedNode::create(*Arg));
        return true;
      }
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp
new file mode 100644
index 0000000000000..b4f30b533bc4b
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -std=c++20
+// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -x c
+// expected-no-diagnostics
+
+typedef struct {} FILE;
+int fprintf( FILE* stream, const char* format, ... );
+FILE * stderr;
+
+#define DEBUG_ASSERT_MESSAGE(name, assertion, label, message, file, line, value) \
+  fprintf(stderr, "AssertMacros: %s, %s file: %s, line: %d, value: %lld\n",      \
+          assertion, (message!=0) ? message : "", file, line, (long long) (value));
+
+
+#define Require(assertion, exceptionLabel)                                 \
+  do                                                                       \
+  {                                                                        \
+    if ( __builtin_expect(!(assertion), 0) ) {                             \
+      DEBUG_ASSERT_MESSAGE(                                                \
+          "DEBUG_ASSERT_COMPONENT_NAME_STRING",                            \
+          #assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0);          \
+      goto exceptionLabel;                                                 \
+    }                                                                      \
+  } while ( 0 )
+
+
+void f(int x, int y) {
+  Require(x == y, L1);
+  L1:
+  return;
+}
+

From 909c9aacead077b14e2bff123d09641d08939fe5 Mon Sep 17 00:00:00 2001
From: Erick Ochoa Lopez
Date: Mon, 17 Nov 2025 16:51:52 -0500
Subject: [PATCH 093/105] [mlir][amdgpu] Add lowerings for ScaledExtPacked816
 (#168123)

* Adds lowerings for amdgpu.scaled_ext_packed816
* Updates verifiers
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 191 +++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  45 +++--
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir        |   1 +
 .../AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir   | 164 +++++++++++++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  32 ---
 5 files changed, 379 insertions(+), 54 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a5831559558ac..edc6565f44f00 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -43,6 +43,7 @@ constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
 constexpr Chipset kGfx950 = Chipset(9, 5, 0);
+constexpr Chipset kGfx1250 = Chipset(12, 5, 0);
 
 /// Convert an unsigned number `val` to i32.
 static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
@@ -1149,7 +1150,7 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
                                  k, isRDNA3);
 
   // Handle gfx1250.
-  if (chipset == Chipset{12, 5, 0})
+  if (chipset == kGfx1250)
     return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType,
                                     elemDestType, k);
 
@@ -1300,7 +1301,7 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
     if (chipset.majorVersion != 11 && chipset.majorVersion != 12)
       return op->emitOpError("WMMA only supported on gfx11 and gfx12");
 
-    bool isGFX1250 = chipset >= Chipset(12, 5, 0);
+    bool isGFX1250 = chipset >= kGfx1250;
 
     // The WMMA operations represent vectors of bf16s as vectors of i16s
     // (except on gfx1250), so we need to bitcast bfloats to i16 and then
@@ -1505,6 +1506,19 @@ struct ExtPackedFp8OpLowering final
     ConversionPatternRewriter &rewriter) const override;
 };
 
+struct ScaledExtPacked816OpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPacked816Op> {
+  ScaledExtPacked816OpLowering(const LLVMTypeConverter &converter,
+                               Chipset chipset)
+      : ConvertOpToLLVMPattern<ScaledExtPacked816Op>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 struct PackedTrunc2xFp8OpLowering final
     : public ConvertOpToLLVMPattern<PackedTrunc2xFp8Op> {
   PackedTrunc2xFp8OpLowering(const LLVMTypeConverter &converter,
@@ -1613,6 +1627,170 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
+int32_t getScaleSel(int32_t blockSize, unsigned bitWidth,
+                    int32_t firstScaleLane, int32_t firstScaleByte) {
+  // When lowering amdgpu.scaled_ext_packed816 to rocdl.cvt.scale.pk*.f*.f*
+  // operations, the attributes blockSize, sourceType, firstScaleLane and
+  // firstScaleByte are merged into a single attribute scaleSel. This is how
+  // those values are merged together.
+  assert(llvm::is_contained({16, 32}, blockSize));
+  assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth));
+
+  const bool is_fp8 = bitWidth == 8;
+  const bool is_block_16 = blockSize == 16;
+
+  if (!is_fp8) {
+    int bit_0 = is_block_16;
+    assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
+    int bit_1 = (firstScaleByte == 2) << 1;
+    assert(llvm::is_contained({0, 1}, firstScaleLane));
+    int bit_2 = firstScaleLane << 2;
+    return bit_2 | bit_1 | bit_0;
+  }
+
+  int bit_0 = is_block_16;
+  // firstScaleByte is guaranteed to be defined by two bits.
+  assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
+  int bit_2_and_1 = firstScaleByte << 1;
+  assert(llvm::is_contained({0, 1}, firstScaleLane));
+  int bit_3 = firstScaleLane << 3;
+  int bits = bit_3 | bit_2_and_1 | bit_0;
+  // These are invalid cases.
+  assert(!llvm::is_contained(
+      {0b0011, 0b0101, 0b0111, 0b1000, 0b1001, 0b1011, 0b1111}, bits));
+  return bits;
+}
+
+static std::optional<StringRef>
+scaledExtPacked816ToIntrinsic(Type srcElemType, Type destElemType) {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  if (isa<fp4>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp4Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp4Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp4Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Bf8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Bf8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Bf8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Fp6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Fp6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Fp6Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Bf6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Bf6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Bf6Op::getOperationName();
+    return std::nullopt;
+  }
+  llvm_unreachable("invalid combination of element types for packed conversion "
+                   "instructions");
+}
+
+LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
+    ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  Location loc = op.getLoc();
+  if (chipset != kGfx1250) {
+    return rewriter.notifyMatchFailure(
+        loc,
+        "Scaled fp packed conversion instructions are not available on target "
+        "architecture and their emulation is not implemented");
+  }
+  int32_t firstScaleLane = op.getFirstScaleLane();
+  int32_t firstScaleByte = op.getFirstScaleByte();
+  int32_t blockSize = op.getBlockSize();
+  auto sourceType = cast<VectorType>(op.getSource().getType());
+  auto srcElemType = cast<FloatType>(sourceType.getElementType());
+  unsigned bitWidth = srcElemType.getWidth();
+  int32_t scaleSel =
+      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
+
+  auto targetType = cast<VectorType>(op.getResult().getType());
+  auto destElemType = cast<FloatType>(targetType.getElementType());
+  IntegerType i32 = rewriter.getI32Type();
+  Value castedScale =
+      LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
+
+  Value source = adaptor.getSource();
+  Type llvmResultType = typeConverter->convertType(op.getResult().getType());
+  Type packedType = nullptr;
+  if (isa<fp4>(srcElemType)) {
+    packedType = i32;
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp8, bf8>(srcElemType)) {
+    packedType = VectorType::get(2, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp6, bf6>(srcElemType)) {
+    packedType = VectorType::get(3, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else {
+    llvm_unreachable("invalid element type for packed scaled ext");
+  }
+
+  if (!packedType || !llvmResultType) {
+    return rewriter.notifyMatchFailure(op, "type conversion failed");
+  }
+
+  Value castedSource =
+      LLVM::BitcastOp::create(rewriter, loc, packedType, source);
+
+  std::optional<StringRef> maybeIntrinsic =
+      scaledExtPacked816ToIntrinsic(srcElemType, destElemType);
+  if (!maybeIntrinsic.has_value())
+    return op.emitOpError(
+        "no intrinsic matching packed scaled conversion on the given chipset");
+
+  OperationState loweredOp(loc, *maybeIntrinsic);
+  loweredOp.addTypes({llvmResultType});
+  loweredOp.addOperands({castedSource, castedScale});
+
+  SmallVector<NamedAttribute> attrs;
+  attrs.push_back(
+      NamedAttribute("scaleSel", rewriter.getI32IntegerAttr(scaleSel)));
+
+  loweredOp.addAttributes(attrs);
+  Operation *lowered = rewriter.create(loweredOp);
+  rewriter.replaceOp(op, lowered);
+
+  return success();
+}
+
 LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
     ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -2151,9 +2329,10 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
           ROCDL::RawPtrBufferAtomicCmpSwap>,
       AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
       SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
-      WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
-      PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
-      PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
-      TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
+      WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering,
+      ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
+      PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
+      GatherToLDSOpLowering, TransposeLoadOpLowering,
+      AMDGPUPermlaneLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 5c35823678576..d55f3cec47c1f 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -343,28 +343,41 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
 //===----------------------------------------------------------------------===//
 LogicalResult ScaledExtPacked816Op::verify() {
   int blockSize = getBlockSize();
-  assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+  assert(llvm::is_contained({16, 32}, blockSize) && "invalid block size");
   int firstScaleByte = getFirstScaleByte();
+  int firstScaleLane = getFirstScaleLane();
   auto sourceType = cast<VectorType>(getSource().getType());
   Type elementType = sourceType.getElementType();
   auto floatType = cast<FloatType>(elementType);
-  int bitWidth = floatType.getWidth();
+  unsigned bitWidth = floatType.getWidth();
 
-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
-      !llvm::is_contained({0, 1}, firstScaleByte)) {
-    return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
-                       "for f4 and f6.");
-  }
-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
-      !llvm::is_contained({0, 2}, firstScaleByte)) {
-    return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
-                       "for f4 and f6.");
-  }
-  if (bitWidth == 8 &&
blockSize == 16 && - !llvm::is_contained({0, 2}, firstScaleByte)) { - return emitOpError( - "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8."); + assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth)); + + const bool is_fp8 = bitWidth == 8; + const bool is_block_16 = blockSize == 16; + + if (!is_fp8) { + if (is_block_16) { + if (!llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError("blockSize of 16 can only have firstScaleByte be 0 " + "or 1 for f4 and f6."); + } + } else { + if (!llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError("blockSize of 32 can only have firstScaleByte be 0 " + "or 2 for f4 and f6."); + } + } + } else { + if (is_block_16) { + bool is_valid = ((firstScaleLane == 0) && (firstScaleByte == 0)) || + ((firstScaleLane == 1) && (firstScaleByte == 2)); + if (!is_valid) { + return emitOpError("blockSize of 16 can only have (firstScaleLane, " + "firstScaleByte) be (0, 0) or (1, 2) for f8."); + } + } } return success(); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 2fd3df6dcfa71..432b8876696a9 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -456,3 +456,4 @@ func.func @sched_barrier() { amdgpu.sched_barrier allow = func.return } + diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir new file mode 100644 index 0000000000000..d2391140ce056 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir @@ -0,0 +1,164 @@ +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics \ +// RUN: | FileCheck %s + +// CHECK-LABEL: @scaled_ext_packed816_fp4 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf4E2M1FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi4:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf4E2M1FN> to vector<8xi4> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.f16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.bf16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.f32.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : 
vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2: vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_fp8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E4M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E4M3FN> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E5M2>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E5M2> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: %[[RES:.+]] = rocdl.cvt.scale.pk8.f16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // 
CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + + +// CHECK-LABEL: @scaled_ext_packed816_fp6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E2M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E2M3FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.bf16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E3M2FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E3M2FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // 
CHECK: rocdl.cvt.scale.pk16.bf16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have (firstScaleLane, firstScaleByte) be (0, 0) or (1, 2) for f8.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_src_elem_type(%v: vector<16xf16>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op operand #0 must be}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf16>, vector<4xf8E8M0FNU> -> vector<16xf16> + return %ret0: vector<16xf16> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf64>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op result #0 must be vector}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> + return %ret0: vector<16xf64> +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir 
b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 5c8cc8b67c4b3..61fdf29a78cbd 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -333,38 +333,6 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
 
 // -----
 
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16>
-  func.return
-}
-
-// -----
-
 func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
   // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
   %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>

From 7d0a2082bffb162f79fd739c79f2bf0b552b9007 Mon Sep 17 00:00:00 2001
From: Guy David
Date: Tue, 18 Nov 2025 00:05:54 +0200
Subject: [PATCH 094/105] [AArch64] Treat COPY between cross-register banks as
 expensive (#167661)

The motivation is to allow passes such as MachineLICM to hoist trivial
FMOV instructions out of loops, where previously it did not do so even
when the RHS is a constant. On most architectures, these expensive move
instructions have a latency of 2-6 cycles and are certainly not as cheap
as a 0-1 cycle move.
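As a concrete illustration (not part of the patch; the function and names
below are hypothetical), here is a minimal C++ sketch in which instruction
selection materializes a loop-invariant GPR-to-FPR transfer. Once the
cross-bank COPY is no longer reported as cheap, MachineLICM is free to hoist
the resulting fmov out of the loop:

#include <bit>
#include <cstdint>

// Minimal sketch: std::bit_cast from an integer to a float lowers to a
// cross-register-bank COPY (an fmov from a w register to an s register)
// on AArch64. The operand is loop-invariant, so hoisting the COPY removes
// a 2-6 cycle move from every iteration of the loop body.
float accumulate(const float *a, int n, std::uint32_t bias_bits) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s += a[i] * std::bit_cast<float>(bias_bits);
  return s;
}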
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 25 +++ .../CodeGen/AArch64/licm-regclass-copy.mir | 197 ++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/licm-regclass-copy.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 221812f1ebc7b..00fe8ee8b9b4d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1144,6 +1144,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) { return Is.size() <= 2; } +// Check if a COPY instruction is cheap. +static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) { + assert(MI.isCopy() && "Expected COPY instruction"); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64, + // typically requiring an FMOV instruction with a 2-6 cycle latency. + auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * { + if (Reg.isVirtual()) + return MRI.getRegClass(Reg); + if (Reg.isPhysical()) + return RI.getMinimalPhysRegClass(Reg); + return nullptr; + }; + const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg()); + if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC)) + return false; + + return MI.isAsCheapAsAMove(); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { @@ -1157,6 +1179,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { default: return MI.isAsCheapAsAMove(); + case TargetOpcode::COPY: + return isCheapCopy(MI, RI); + case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::SUBWrs: diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir new file mode 100644 index 0000000000000..6a10df68ddc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s + +# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR) +# are hoisted out of loops by MachineLICM, as they are expensive on AArch64. + +--- | + declare void @use_float(float) + declare void @use_int(i32) + + define void @gpr_to_fpr_virtual_copy_hoisted() { + ret void + } + + define void @gpr_to_fpr_physical_copy_hoisted() { + ret void + } + + define void @fpr_to_gpr_virtual_copy_hoisted() { + ret void + } +... 
+--- +name: gpr_to_fpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $w1 + %1:gpr32 = COPY $w0 + %0:gpr32 = COPY $w1 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY %0:gpr32 + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: gpr_to_fpr_physical_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY3]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0 + %1:gpr32 = COPY $w0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY $wzr + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: fpr_to_gpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $w0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $s0 + %1:gpr32 = COPY $w0 + %0:fpr32 = COPY $s0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:gpr32 = COPY %0:fpr32 + $w0 = COPY %7:gpr32 + BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... From 6245a4f875700594281022de6e38fba4439f5edf Mon Sep 17 00:00:00 2001 From: Ben Kallus Date: Mon, 17 Nov 2025 22:14:08 +0000 Subject: [PATCH 095/105] Add support for the .base64 directive (#165549) Starting in version 15, GCC emits a `.base64` directive instead of `.string` or `.ascii` for char arrays of length `>= 3`. See [this godbolt link](https://godbolt.org/z/ebhe3oenv) for an example. This patch adds support for the .base64 directive to AsmParser.cpp, so tools like `llvm-mc` can process the output of GCC more effectively. This addresses #165499. 
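
For a concrete feel of what the directive emits, here is a small standalone
sketch (the toy decoder below is illustrative only and assumed to match the
observable behavior; the in-tree directive delegates to llvm::decodeBase64
from llvm/Support/Base64.h): decoding "YWJjeHl6" yields the bytes of
"abcxyz", matching the TEST1 case in the new test.

    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    // Toy base64 decoder: rejects empty or non-4-aligned input, stops at
    // '=' padding, and maps each symbol through the standard alphabet.
    std::optional<std::vector<uint8_t>> decode(const std::string &In) {
      static const std::string Alphabet =
          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
      if (In.empty() || In.size() % 4 != 0)
        return std::nullopt;
      std::vector<uint8_t> Out;
      uint32_t Buf = 0;
      int Bits = 0;
      for (char C : In) {
        if (C == '=')
          break;
        size_t Pos = Alphabet.find(C);
        if (Pos == std::string::npos)
          return std::nullopt;
        Buf = (Buf << 6) | static_cast<uint32_t>(Pos);
        Bits += 6;
        if (Bits >= 8) {
          Bits -= 8;
          Out.push_back(static_cast<uint8_t>((Buf >> Bits) & 0xFF));
        }
      }
      return Out;
    }

    int main() {
      auto Bytes = decode("YWJjeHl6"); // same payload as TEST1 below
      if (!Bytes)
        return 1;
      for (uint8_t B : *Bytes)
        std::cout << static_cast<char>(B);
      std::cout << '\n'; // prints: abcxyz
    }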
---
 llvm/lib/MC/MCParser/AsmParser.cpp        | 37 +++++++++++++++++++++++
 llvm/test/MC/AsmParser/directive_base64.s | 37 +++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 llvm/test/MC/AsmParser/directive_base64.s

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 3c9ab8e108ddd..c9cce7d69cc01 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -46,6 +46,7 @@
 #include "llvm/MC/MCSymbolMachO.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Base64.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -528,6 +529,7 @@ class AsmParser : public MCAsmParser {
     DK_LTO_SET_CONDITIONAL,
     DK_CFI_MTE_TAGGED_FRAME,
     DK_MEMTAG,
+    DK_BASE64,
     DK_END
   };

@@ -550,6 +552,7 @@ class AsmParser : public MCAsmParser {

   // ".ascii", ".asciz", ".string"
   bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+  bool parseDirectiveBase64(); // ".base64"
   bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc"
   bool parseDirectiveValue(StringRef IDVal,
                            unsigned Size); // ".byte", ".long", ...
@@ -1951,6 +1954,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     case DK_ASCIZ:
     case DK_STRING:
       return parseDirectiveAscii(IDVal, true);
+    case DK_BASE64:
+      return parseDirectiveBase64();
     case DK_BYTE:
     case DK_DC_B:
       return parseDirectiveValue(IDVal, 1);
@@ -3074,6 +3079,37 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   return parseMany(parseOp);
 }

+/// parseDirectiveBase64:
+//   ::= .base64 "string" (, "string" )*
+bool AsmParser::parseDirectiveBase64() {
+  auto parseOp = [&]() -> bool {
+    if (checkForValidSection())
+      return true;
+
+    if (getTok().isNot(AsmToken::String)) {
+      return true;
+    }
+
+    std::vector<char> Decoded;
+    std::string const str = getTok().getStringContents().str();
+    if (check(str.empty(), "expected nonempty string")) {
+      return true;
+    }
+
+    llvm::Error e = decodeBase64(str, Decoded);
+    if (e) {
+      consumeError(std::move(e));
+      return Error(Lexer.getLoc(), "failed to base64 decode string data");
+    }
+
+    getStreamer().emitBytes(std::string(Decoded.begin(), Decoded.end()));
+    Lex();
+    return false;
+  };
+
+  return check(parseMany(parseOp), "expected string");
+}
+
 /// parseDirectiveReloc
 ///  ::= .reloc expression , identifier [ , expression ]
 bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
@@ -5343,6 +5379,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".asciz"] = DK_ASCIZ;
   DirectiveKindMap[".string"] = DK_STRING;
   DirectiveKindMap[".byte"] = DK_BYTE;
+  DirectiveKindMap[".base64"] = DK_BASE64;
   DirectiveKindMap[".short"] = DK_SHORT;
   DirectiveKindMap[".value"] = DK_VALUE;
   DirectiveKindMap[".2byte"] = DK_2BYTE;
diff --git a/llvm/test/MC/AsmParser/directive_base64.s b/llvm/test/MC/AsmParser/directive_base64.s
new file mode 100644
index 0000000000000..46a477eef51dc
--- /dev/null
+++ b/llvm/test/MC/AsmParser/directive_base64.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-unknown-unknown -defsym=ERR=1 -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+  .data
+# CHECK-LABEL: TEST0:
+# CHECK-NEXT: .byte 0
+TEST0:
+  .base64 "AA=="
+
+# CHECK-LABEL: TEST1:
+# CHECK-NEXT: .ascii "abcxyz"
+TEST1:
+  .base64 "YWJjeHl6"
+
+# CHECK-LABEL: TEST2:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST2:
+  .base64 "AQ=="
+  .base64 "Ag=="
+
+# CHECK-LABEL: TEST3:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST3:
+  .base64 "AQ==", "Ag=="
+
+.ifdef ERR
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected string
+  .base64 not-a-string
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: failed to base64 decode string data
+  .base64 "AA"
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected nonempty string
+  .base64 ""
+.endif

From 88b3969dfe20bd624209e44d5c8cef365d9484e0 Mon Sep 17 00:00:00 2001
From: John Harrison
Date: Mon, 17 Nov 2025 14:19:22 -0800
Subject: [PATCH 096/105] [lldb-dap] Address a unit test race condition during
 initialization. (#167981)

During the initialization sequence in our tests, the first 'threads'
response should only be kept if the process is actually stopped;
otherwise we will have stale data.

In VSCode, during the debug session startup sequence immediately after
'configurationDone', a 'threads' request is made. This initial request
is to retrieve the main thread's name and id so the UI can be populated.
However, in our tests we do not want to cache this value unless the
process is actually stopped. We do need to make this initial request
because lldb-dap is caching the initial thread list during
configurationDone before the process is resumed, and making this call
ensures the cached initial threads are purged.

I noticed this in a CI job for another review
(https://github.com/llvm/llvm-project/actions/runs/19348261989/job/55353961798)
where the tests incorrectly failed to fetch the threads prior to
validating the thread names.
---
 .../lldbsuite/test/tools/lldb-dap/dap_server.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index a4ca090021f3f..f85ab1910a2eb 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -191,6 +191,11 @@ class NotSupportedError(KeyError):


 class DebugCommunication(object):
+    @property
+    def is_stopped(self) -> bool:
+        """Returns True if the debuggee is stopped, otherwise False."""
+        return len(self.thread_stop_reasons) > 0 or self.exit_status is not None
+
     def __init__(
         self,
         recv: BinaryIO,
@@ -860,7 +865,17 @@ def request_configurationDone(self):
         response = self._send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
+            stopped_on_entry = self.is_stopped
             self.request_threads()
+            if not stopped_on_entry:
+                # Drop the initial cached threads if we did not stop-on-entry.
+                # In VSCode, immediately following 'configurationDone', a
+                # 'threads' request is made to get the initial set of threads,
+                # specifically the main thread's id and name.
+                # We issue the threads request to mimic this pattern but in our
+                # tests we don't want to cache the result unless the process is
+                # actually stopped.
+                self.threads = None
         return response

     def _process_stopped(self):

From e89e359313a036131e4926ce2a9a97b06f5993ce Mon Sep 17 00:00:00 2001
From: Prabhu Rajasekaran
Date: Mon, 17 Nov 2025 14:25:58 -0800
Subject: [PATCH 097/105] [libc][Github] Fix typo on build_type param (#168453)

There is an extra underscore in the build_type param in the #167583
patch. Fixing it in this PR.
--- .github/workflows/libc-fullbuild-tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index c5b7f606a115a..01fd895cce7e8 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -49,37 +49,37 @@ jobs: target: x86_64-unknown-uefi-llvm include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv6m-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv7m-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv7em-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv8m.main-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv8.1m.main-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: riscv32-unknown-elf From f0f53326c735b9f2cb52f5a63312f2d0b700d6cf Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Mon, 17 Nov 2025 14:55:52 -0800 Subject: [PATCH 098/105] [mlir][acc] Add ACCImplicitRoutine pass for implicit `acc routine` (#168433) This change adds the ACCImplicitRoutine pass which implements the OpenACC specification for implicit routine directives (OpenACC 3.4 spec, section 2.15.1). According to the specification: "If no explicit routine directive applies to a procedure whose definition appears in the program unit being compiled, then the implementation applies an implicit routine directive to that procedure if any of the following conditions holds: The procedure is called or its address is accessed in a compute region." The pass automatically generates `acc.routine` operations for functions called within OpenACC compute constructs or within existing routine functions that do not already have explicit routine directives. It recursively applies implicit routine directives while avoiding infinite recursion when dependencies form cycles. Key features: - Walks through all OpenACC compute constructs (parallel, kernels, serial) to identify function calls - Creates implicit `acc.routine` operations for functions without explicit routine declarations - Recursively processes existing `acc.routine` operations to handle transitive dependencies - Avoids infinite recursion through proper tracking of processed routines - Respects device-type specific bind clauses to skip routines bound to different device types Requirements: - Function operations must implement `mlir::FunctionOpInterface` to be identified and associated with routine directives. - Call operations must implement `mlir::CallOpInterface` to detect function calls and traverse the call graph. 
- Optionally pre-register `acc::OpenACCSupport` if custom behavior is needed for determining if a symbol use is valid within GPU regions (such as functions which are already considerations for offloading even without `acc routine` markings) Co-authored-by: delaram-talaashrafi --- .../mlir/Dialect/OpenACC/Transforms/Passes.td | 38 +++ .../OpenACC/Transforms/ACCImplicitRoutine.cpp | 237 ++++++++++++++++++ .../Dialect/OpenACC/Transforms/CMakeLists.txt | 1 + 3 files changed, 276 insertions(+) create mode 100644 mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td index 40ccd1fc6c1a0..970d9304d8289 100644 --- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td @@ -63,4 +63,42 @@ def ACCImplicitData : Pass<"acc-implicit-data", "mlir::ModuleOp"> { ]; } +def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> { + let summary = "Generate implicit acc routine for functions in acc regions"; + let description = [{ + This pass implements the implicit rules described in OpenACC specification + for `Routine Directive` (OpenACC 3.4 spec, section 2.15.1). + + "If no explicit routine directive applies to a procedure whose definition + appears in the program unit being compiled, then the implementation applies + an implicit routine directive to that procedure if any of the following + conditions holds: + - The procedure is called or its address is accessed in a compute region." + + The specification further states: + "When the implementation applies an implicit routine directive to a procedure, + it must recursively apply implicit routine directives to other procedures for + which the above rules specify relevant dependencies. Such dependencies can + form a cycle, so the implementation must take care to avoid infinite recursion." + + This pass implements these requirements by: + 1. Walking through all OpenACC compute constructs and functions already + marked with `acc routine` in the module and identifying function calls + within these regions. + 2. Creating implicit `acc.routine` operations for functions that don't already + have routine declarations. + 3. Recursively walking through all existing `acc routine` and creating + implicit routine operations for function calls within these routines, + while avoiding infinite recursion through proper tracking. + }]; + let dependentDialects = ["mlir::acc::OpenACCDialect"]; + let options = [ + Option<"deviceType", "device-type", "mlir::acc::DeviceType", + "mlir::acc::DeviceType::None", + "Target device type for implicit routine generation. " + "Ensures that `acc routine` device_type clauses are " + "properly considered not just default clauses."> + ]; +} + #endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp new file mode 100644 index 0000000000000..12efaf487a8ca --- /dev/null +++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp @@ -0,0 +1,237 @@ +//===- ACCImplicitRoutine.cpp - OpenACC Implicit Routine Transform -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the implicit rules described in OpenACC specification
+// for `Routine Directive` (OpenACC 3.4 spec, section 2.15.1).
+//
+// "If no explicit routine directive applies to a procedure whose definition
+// appears in the program unit being compiled, then the implementation applies
+// an implicit routine directive to that procedure if any of the following
+// conditions holds:
+// - The procedure is called or its address is accessed in a compute region."
+//
+// The specification further states:
+// "When the implementation applies an implicit routine directive to a
+// procedure, it must recursively apply implicit routine directives to other
+// procedures for which the above rules specify relevant dependencies. Such
+// dependencies can form a cycle, so the implementation must take care to avoid
+// infinite recursion."
+//
+// This pass implements these requirements by:
+// 1. Walking through all OpenACC compute constructs and functions already
+//    marked with `acc routine` in the module and identifying function calls
+//    within these regions.
+// 2. Creating implicit `acc.routine` operations for functions that don't
+//    already have routine declarations.
+// 3. Recursively walking through all existing `acc routine` and creating
+//    implicit routine operations for function calls within these routines,
+//    while avoiding infinite recursion through proper tracking.
+//
+// Requirements:
+// -------------
+// To use this pass in a pipeline, the following requirements must be met:
+//
+// 1. Operation Interface Implementation: Operations that define functions
+//    or call functions should implement `mlir::FunctionOpInterface` and
+//    `mlir::CallOpInterface` respectively.
+//
+// 2. Analysis Registration (Optional): If custom behavior is needed for
+//    determining if a symbol use is valid within GPU regions, the dialect
+//    should pre-register the `acc::OpenACCSupport` analysis.
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include <queue>
+
+#define DEBUG_TYPE "acc-implicit-routine"
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCIMPLICITROUTINE
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+namespace {
+
+using namespace mlir;
+
+class ACCImplicitRoutine
+    : public acc::impl::ACCImplicitRoutineBase<ACCImplicitRoutine> {
+private:
+  unsigned routineCounter = 0;
+  static constexpr llvm::StringRef accRoutinePrefix = "acc_routine_";
+
+  // Count existing routine operations and update counter
+  void initRoutineCounter(ModuleOp module) {
+    module.walk([&](acc::RoutineOp routineOp) { routineCounter++; });
+  }
+
+  // Check if routine has a default bind clause or a device-type specific bind
+  // clause. Returns true if `acc routine` has a default bind clause or
+  // a device-type specific bind clause.
+  bool isACCRoutineBindDefaultOrDeviceType(acc::RoutineOp op,
+                                           acc::DeviceType deviceType) {
+    // Fast check to avoid device-type specific lookups.
+    if (!op.getBindIdName() && !op.getBindStrName())
+      return false;
+    return op.getBindNameValue().has_value() ||
+           op.getBindNameValue(deviceType).has_value();
+  }
+
+  // Generate a unique name for the routine and create the routine operation
+  acc::RoutineOp createRoutineOp(OpBuilder &builder, Location loc,
+                                 FunctionOpInterface &callee) {
+    std::string routineName =
+        (accRoutinePrefix + std::to_string(routineCounter++)).str();
+    auto routineOp = acc::RoutineOp::create(
+        builder, loc,
+        /* sym_name=*/builder.getStringAttr(routineName),
+        /* func_name=*/
+        mlir::SymbolRefAttr::get(builder.getContext(),
+                                 builder.getStringAttr(callee.getName())),
+        /* bindIdName=*/nullptr,
+        /* bindStrName=*/nullptr,
+        /* bindIdNameDeviceType=*/nullptr,
+        /* bindStrNameDeviceType=*/nullptr,
+        /* worker=*/nullptr,
+        /* vector=*/nullptr,
+        /* seq=*/nullptr,
+        /* nohost=*/nullptr,
+        /* implicit=*/builder.getUnitAttr(),
+        /* gang=*/nullptr,
+        /* gangDim=*/nullptr,
+        /* gangDimDeviceType=*/nullptr);
+
+    // Assert that the callee does not already have routine info attribute
+    assert(!callee->hasAttr(acc::getRoutineInfoAttrName()) &&
+           "function is already associated with a routine");
+
+    callee->setAttr(
+        acc::getRoutineInfoAttrName(),
+        mlir::acc::RoutineInfoAttr::get(
+            builder.getContext(),
+            {mlir::SymbolRefAttr::get(builder.getContext(),
+                                      builder.getStringAttr(routineName))}));
+    return routineOp;
+  }
+
+  // Used to walk through a compute region looking for function calls.
+  void
+  implicitRoutineForCallsInComputeRegions(Operation *op, SymbolTable &symTab,
+                                          mlir::OpBuilder &builder,
+                                          acc::OpenACCSupport &accSupport) {
+    op->walk([&](CallOpInterface callOp) {
+      if (!callOp.getCallableForCallee())
+        return;
+
+      auto calleeSymbolRef =
+          dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+      // When call is done through ssa value, the callee is not a symbol.
+      // Skip it because we don't know the call target.
+      if (!calleeSymbolRef)
+        return;
+
+      auto callee = symTab.lookup<FunctionOpInterface>(
+          calleeSymbolRef.getLeafReference().str());
+      // If the callee does not exist or is already a valid symbol for GPU
+      // regions, skip it
+
+      assert(callee && "callee function must be found in symbol table");
+      if (accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef))
+        return;
+      builder.setInsertionPoint(callee);
+      createRoutineOp(builder, callee.getLoc(), callee);
+    });
+  }
+
+  // Recursively handle calls within a routine operation
+  void implicitRoutineForCallsInRoutine(acc::RoutineOp routineOp,
+                                        mlir::OpBuilder &builder,
+                                        acc::OpenACCSupport &accSupport,
+                                        acc::DeviceType targetDeviceType) {
+    // When bind clause is used, it means that the target is different than the
+    // function to which the `acc routine` is used with. Skip this case to
+    // avoid implicitly recursively marking calls that would not end up on
+    // device.
+    if (isACCRoutineBindDefaultOrDeviceType(routineOp, targetDeviceType))
+      return;
+
+    SymbolTable symTab(routineOp->getParentOfType<ModuleOp>());
+    std::queue<acc::RoutineOp> routineQueue;
+    routineQueue.push(routineOp);
+    while (!routineQueue.empty()) {
+      auto currentRoutine = routineQueue.front();
+      routineQueue.pop();
+      auto func = symTab.lookup<FunctionOpInterface>(
+          currentRoutine.getFuncName().getLeafReference());
+      func.walk([&](CallOpInterface callOp) {
+        if (!callOp.getCallableForCallee())
+          return;
+
+        auto calleeSymbolRef =
+            dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+        // When call is done through ssa value, the callee is not a symbol.
+        // Skip it because we don't know the call target.
+        if (!calleeSymbolRef)
+          return;
+
+        auto callee = symTab.lookup<FunctionOpInterface>(
+            calleeSymbolRef.getLeafReference().str());
+        // If the callee does not exist or is already a valid symbol for GPU
+        // regions, skip it
+        assert(callee && "callee function must be found in symbol table");
+        if (accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef))
+          return;
+        builder.setInsertionPoint(callee);
+        auto newRoutineOp = createRoutineOp(builder, callee.getLoc(), callee);
+        routineQueue.push(newRoutineOp);
+      });
+    }
+  }
+
+public:
+  using ACCImplicitRoutineBase::ACCImplicitRoutineBase;
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    mlir::OpBuilder builder(module.getContext());
+    SymbolTable symTab(module);
+    initRoutineCounter(module);
+
+    acc::OpenACCSupport &accSupport = getAnalysis<acc::OpenACCSupport>();
+
+    // Handle compute regions
+    module.walk([&](Operation *op) {
+      if (isa<ACC_COMPUTE_CONSTRUCT_OPS>(op))
+        implicitRoutineForCallsInComputeRegions(op, symTab, builder,
+                                                accSupport);
+    });
+
+    // Use the device type option from the pass options.
+    acc::DeviceType targetDeviceType = deviceType;
+
+    // Handle existing routines
+    module.walk([&](acc::RoutineOp routineOp) {
+      implicitRoutineForCallsInRoutine(routineOp, builder, accSupport,
+                                       targetDeviceType);
+    });
+  }
+};
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index f8fff5958f8c7..028af0362f26e 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIROpenACCTransforms
   ACCImplicitData.cpp
+  ACCImplicitRoutine.cpp
   LegalizeDataValues.cpp

   ADDITIONAL_HEADER_DIRS

From 43dacd07f660064d4023342efb067f39fafc592f Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Tue, 18 Nov 2025 01:58:26 +0300
Subject: [PATCH 099/105] [PowerPC] TableGen-erate SDNode descriptions
 (#168108)

This allows SDNodes to be validated against their expected type profiles
and reduces the number of changes required to add a new node.

The validation functionality has detected several issues, see
`PPCSelectionDAGInfo::verifyTargetNode()`.

Most of the nodes have a description in `*.td` files and were
successfully "imported". Those that don't have a description are listed
in the enum in `PPCSelectionDAGInfo.td`. These nodes are not validated.

Part of #119709.

Pull Request: https://github.com/llvm/llvm-project/pull/168108 --- llvm/lib/Target/PowerPC/CMakeLists.txt | 1 + llvm/lib/Target/PowerPC/PPCFastISel.cpp | 1 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 1 + llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 185 +----- llvm/lib/Target/PowerPC/PPCISelLowering.h | 578 ------------------ llvm/lib/Target/PowerPC/PPCInstrInfo.td | 301 ++++++++- llvm/lib/Target/PowerPC/PPCInstrP10.td | 21 +- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 60 ++ .../Target/PowerPC/PPCSelectionDAGInfo.cpp | 66 +- llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h | 57 +- 10 files changed, 477 insertions(+), 794 deletions(-) diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 2182039e0eef8..53d565013c4bc 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel) tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info) tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM PPCGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM PPCGenExegesis.inc -gen-exegesis) tablegen(LLVM PPCGenRegisterBank.inc -gen-register-bank) diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index ea4e597d0fd7d..ca3fe18273ff5 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -17,6 +17,7 @@ #include "PPCCallingConv.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 89165fa8f8fdb..dd537c204cec1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APInt.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f55336bafd251..220010c4d3d34 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -20,6 +20,7 @@ #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCRegisterInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APFloat.h" @@ -1678,190 +1679,6 @@ bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore( return false; } -const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((PPCISD::NodeType)Opcode) { - case PPCISD::FIRST_NUMBER: break; - case PPCISD::FSEL: return "PPCISD::FSEL"; - case PPCISD::XSMAXC: return "PPCISD::XSMAXC"; - case PPCISD::XSMINC: return "PPCISD::XSMINC"; - case PPCISD::FCFID: return "PPCISD::FCFID"; - case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; - case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; - case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; - case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; - case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; - case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; - case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; - case PPCISD::FRE: return "PPCISD::FRE"; 
- case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; - case PPCISD::FTSQRT: - return "PPCISD::FTSQRT"; - case PPCISD::FSQRT: - return "PPCISD::FSQRT"; - case PPCISD::STFIWX: return "PPCISD::STFIWX"; - case PPCISD::VPERM: return "PPCISD::VPERM"; - case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXSPLTI_SP_TO_DP: - return "PPCISD::XXSPLTI_SP_TO_DP"; - case PPCISD::XXSPLTI32DX: - return "PPCISD::XXSPLTI32DX"; - case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; - case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; - case PPCISD::XXPERM: - return "PPCISD::XXPERM"; - case PPCISD::VECSHL: return "PPCISD::VECSHL"; - case PPCISD::VSRQ: - return "PPCISD::VSRQ"; - case PPCISD::CMPB: return "PPCISD::CMPB"; - case PPCISD::Hi: return "PPCISD::Hi"; - case PPCISD::Lo: return "PPCISD::Lo"; - case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; - case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; - case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; - case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; - case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA"; - case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; - case PPCISD::SRL: return "PPCISD::SRL"; - case PPCISD::SRA: return "PPCISD::SRA"; - case PPCISD::SHL: return "PPCISD::SHL"; - case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; - case PPCISD::CALL: return "PPCISD::CALL"; - case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; - case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC"; - case PPCISD::CALL_RM: - return "PPCISD::CALL_RM"; - case PPCISD::CALL_NOP_RM: - return "PPCISD::CALL_NOP_RM"; - case PPCISD::CALL_NOTOC_RM: - return "PPCISD::CALL_NOTOC_RM"; - case PPCISD::MTCTR: return "PPCISD::MTCTR"; - case PPCISD::BCTRL: return "PPCISD::BCTRL"; - case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; - case PPCISD::BCTRL_RM: - return "PPCISD::BCTRL_RM"; - case PPCISD::BCTRL_LOAD_TOC_RM: - return "PPCISD::BCTRL_LOAD_TOC_RM"; - case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE"; - case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; - case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; - case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; - case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; - case PPCISD::MFVSR: return "PPCISD::MFVSR"; - case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; - case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; - case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; - case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; - case PPCISD::SCALAR_TO_VECTOR_PERMUTED: - return "PPCISD::SCALAR_TO_VECTOR_PERMUTED"; - case PPCISD::ANDI_rec_1_EQ_BIT: - return "PPCISD::ANDI_rec_1_EQ_BIT"; - case PPCISD::ANDI_rec_1_GT_BIT: - return "PPCISD::ANDI_rec_1_GT_BIT"; - case PPCISD::VCMP: return "PPCISD::VCMP"; - case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec"; - case PPCISD::LBRX: return "PPCISD::LBRX"; - case PPCISD::STBRX: return "PPCISD::STBRX"; - case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; - case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; - case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; - case PPCISD::STXSIX: return "PPCISD::STXSIX"; - case PPCISD::VEXTS: return "PPCISD::VEXTS"; - case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; - case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; - case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE"; - case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE"; - case PPCISD::ST_VSR_SCAL_INT: - return "PPCISD::ST_VSR_SCAL_INT"; - case PPCISD::COND_BRANCH: 
return "PPCISD::COND_BRANCH"; - case PPCISD::BDNZ: return "PPCISD::BDNZ"; - case PPCISD::BDZ: return "PPCISD::BDZ"; - case PPCISD::MFFS: return "PPCISD::MFFS"; - case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; - case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; - case PPCISD::CR6SET: return "PPCISD::CR6SET"; - case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; - case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; - case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; - case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; - case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; - case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; - case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; - case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; - case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; - case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX"; - case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER"; - case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; - case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX"; - case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX"; - case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; - case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; - case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; - case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; - case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; - case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; - case PPCISD::PADDI_DTPREL: - return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: - return "PPCISD::VADD_SPLAT"; - case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; - case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; - case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; - case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; - case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; - case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; - case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; - case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; - case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR"; - case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR: - return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR"; - case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR: - return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR"; - case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD"; - case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD"; - case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG"; - case PPCISD::XXMFACC: return "PPCISD::XXMFACC"; - case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; - case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT"; - case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT"; - case PPCISD::FNMSUB: return "PPCISD::FNMSUB"; - case PPCISD::STRICT_FADDRTZ: - return "PPCISD::STRICT_FADDRTZ"; - case PPCISD::STRICT_FCTIDZ: - return "PPCISD::STRICT_FCTIDZ"; - case PPCISD::STRICT_FCTIWZ: - return "PPCISD::STRICT_FCTIWZ"; - case PPCISD::STRICT_FCTIDUZ: - return "PPCISD::STRICT_FCTIDUZ"; - case PPCISD::STRICT_FCTIWUZ: - return "PPCISD::STRICT_FCTIWUZ"; - case PPCISD::STRICT_FCFID: - return "PPCISD::STRICT_FCFID"; - case PPCISD::STRICT_FCFIDU: - return "PPCISD::STRICT_FCFIDU"; - case PPCISD::STRICT_FCFIDS: - return "PPCISD::STRICT_FCFIDS"; - case PPCISD::STRICT_FCFIDUS: - return "PPCISD::STRICT_FCFIDUS"; - case PPCISD::LXVRZX: return "PPCISD::LXVRZX"; - case PPCISD::STORE_COND: - return "PPCISD::STORE_COND"; - case PPCISD::SETBC: - return "PPCISD::SETBC"; - case 
PPCISD::SETBCR: - return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; - } - return nullptr; -} - EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index d967018982734..680b529b4e2e5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -34,580 +34,6 @@ namespace llvm { - namespace PPCISD { - - // When adding a NEW PPCISD node please add it to the correct position in - // the enum. The order of elements in this enum matters! - // Values that are added between FIRST_MEMORY_OPCODE and LAST_MEMORY_OPCODE - // are considered memory opcodes and are treated differently than other - // entries. - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// FSEL - Traditional three-operand fsel node. - /// - FSEL, - - /// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. - XSMAXC, - XSMINC, - - /// FCFID - The FCFID instruction, taking an f64 operand and producing - /// and f64 value containing the FP representation of the integer that - /// was temporarily in the f64 operand. - FCFID, - - /// Newer FCFID[US] integer-to-floating-point conversion instructions for - /// unsigned integers and single-precision outputs. - FCFIDU, - FCFIDS, - FCFIDUS, - - /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 - /// operand, producing an f64 value containing the integer representation - /// of that FP value. - FCTIDZ, - FCTIWZ, - - /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers with round toward zero. - FCTIDUZ, - FCTIWUZ, - - /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in - /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. - VEXTS, - - /// Reciprocal estimate instructions (unary FP ops). - FRE, - FRSQRTE, - - /// Test instruction for software square root. - FTSQRT, - - /// Square root instruction. - FSQRT, - - /// VPERM - The PPC VPERM Instruction. - /// - VPERM, - - /// XXSPLT - The PPC VSX splat instructions - /// - XXSPLT, - - /// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for - /// converting immediate single precision numbers to double precision - /// vector or scalar. - XXSPLTI_SP_TO_DP, - - /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. - /// - XXSPLTI32DX, - - /// VECINSERT - The PPC vector insert instruction - /// - VECINSERT, - - /// VECSHL - The PPC vector shift left instruction - /// - VECSHL, - - /// XXPERMDI - The PPC XXPERMDI instruction - /// - XXPERMDI, - XXPERM, - - /// The CMPB instruction (takes two operands of i32 or i64). - CMPB, - - /// Hi/Lo - These represent the high and low 16-bit parts of a global - /// address respectively. These nodes have two operands, the first of - /// which must be a TargetGlobalAddress, and the second of which must be a - /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', - /// though these are usually folded into other nodes. - Hi, - Lo, - - /// The following two target-specific nodes are used for calls through - /// function pointers in the 64-bit SVR4 ABI. 
- - /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an allocation on the stack. - DYNALLOC, - - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an offset from native SP to the address of the most recent - /// dynamic alloca. - DYNAREAOFFSET, - - /// To avoid stack clash, allocation is performed by block and each block is - /// probed. - PROBED_ALLOCA, - - /// The result of the mflr at function entry, used for PIC code. - GlobalBaseReg, - - /// These nodes represent PPC shifts. - /// - /// For scalar types, only the last `n + 1` bits of the shift amounts - /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. - /// for exact behaviors. - /// - /// For vector types, only the last n bits are used. See vsld. - SRL, - SRA, - SHL, - - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - - /// FNMSUB - Negated multiply-subtract instruction. - FNMSUB, - - /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign - /// word and shift left immediate. - EXTSWSLI, - - /// The combination of sra[wd]i and addze used to implemented signed - /// integer division by a power of 2. The first operand is the dividend, - /// and the second is the constant shift amount (representing the - /// divisor). - SRA_ADDZE, - - /// CALL - A direct function call. - /// CALL_NOP is a call with the special NOP which follows 64-bit - /// CALL_NOTOC the caller does not use the TOC. - /// SVR4 calls and 32-bit/64-bit AIX calls. - CALL, - CALL_NOP, - CALL_NOTOC, - - /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a - /// MTCTR instruction. - MTCTR, - - /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a - /// BCTRL instruction. - BCTRL, - - /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl - /// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX - /// and 64-bit AIX. - BCTRL_LOAD_TOC, - - /// The variants that implicitly define rounding mode for calls with - /// strictfp semantics. - CALL_RM, - CALL_NOP_RM, - CALL_NOTOC_RM, - BCTRL_RM, - BCTRL_LOAD_TOC_RM, - - /// Return with a glue operand, matched by 'blr' - RET_GLUE, - - /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. - /// This copies the bits corresponding to the specified CRREG into the - /// resultant GPR. Bits corresponding to other CR regs are undefined. - MFOCRF, - - /// Direct move from a VSX register to a GPR - MFVSR, - - /// Direct move from a GPR to a VSX register (algebraic) - MTVSRA, - - /// Direct move from a GPR to a VSX register (zero) - MTVSRZ, - - /// Direct move of 2 consecutive GPR to a VSX register. - BUILD_FP128, - - /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and - /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is - /// unsupported for this target. - /// Merge 2 GPRs to a single SPE register. - BUILD_SPE64, - - /// Extract SPE register component, second argument is high or low. - EXTRACT_SPE, - - /// Extract a subvector from signed integer vector and convert to FP. - /// It is primarily used to convert a (widened) illegal integer vector - /// type to a legal floating point vector type. - /// For example v2i32 -> widened to v4i32 -> v2f64 - SINT_VEC_TO_FP, - - /// Extract a subvector from unsigned integer vector and convert to FP. - /// As with SINT_VEC_TO_FP, used for converting illegal types. 
- UINT_VEC_TO_FP, - - /// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to - /// place the value into the least significant element of the most - /// significant doubleword in the vector. This is not element zero for - /// anything smaller than a doubleword on either endianness. This node has - /// the same semantics as SCALAR_TO_VECTOR except that the value remains in - /// the aforementioned location in the vector register. - SCALAR_TO_VECTOR_PERMUTED, - - // FIXME: Remove these once the ANDI glue bug is fixed: - /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the - /// eq or gt bit of CR0 after executing andi. x, 1. This is used to - /// implement truncation of i32 or i64 to i1. - ANDI_rec_1_EQ_BIT, - ANDI_rec_1_GT_BIT, - - // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit - // target (returns (Lo, Hi)). It takes a chain operand. - READ_TIME_BASE, - - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. - EH_SJLJ_SETJMP, - - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, - - /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* - /// instructions. For lack of better number, we use the opcode number - /// encoding for the OPC field to identify the compare. For example, 838 - /// is VCMPGTSH. - VCMP, - - /// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the - /// altivec VCMP*_rec instructions. For lack of better number, we use the - /// opcode number encoding for the OPC field to identify the compare. For - /// example, 838 is VCMPGTSH. - VCMP_rec, - - /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This - /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the - /// condition register to branch on, OPC is the branch opcode to use (e.g. - /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is - /// an optional input flag argument. - COND_BRANCH, - - /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based - /// loops. - BDNZ, - BDZ, - - /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding - /// towards zero. Used only as part of the long double-to-int - /// conversion sequence. - FADDRTZ, - - /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. - MFFS, - - /// TC_RETURN - A tail call return. - /// operand #0 chain - /// operand #1 callee (register or absolute) - /// operand #2 stack adjustment - /// operand #3 optional in flag - TC_RETURN, - - /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls - CR6SET, - CR6UNSET, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS - /// for non-position independent code on PPC32. - PPC32_GOT, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and - /// local dynamic TLS and position indendepent code on PPC32. - PPC32_PICGOT, - - /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec - /// TLS model, produces an ADDIS8 instruction that adds the GOT - /// base to sym\@got\@tprel\@ha. - ADDIS_GOT_TPREL_HA, - - /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec - /// TLS model, produces a LD instruction with base register G8RReg - /// and offset sym\@got\@tprel\@l. This completes the addition that - /// finds the offset of "sym" relative to the thread pointer. 
- LD_GOT_TPREL_L, - - /// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec - /// and local-exec TLS models, produces an ADD instruction that adds - /// the contents of G8RReg to the thread pointer. Symbol contains a - /// relocation sym\@tls which is to be replaced by the thread pointer - /// and identifies to the linker that the instruction is part of a - /// TLS sequence. - ADD_TLS, - - /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsgd\@ha. - ADDIS_TLSGD_HA, - - /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - ADDI_TLSGD_L, - - /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - GET_TLS_ADDR, - - /// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on - /// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread - /// pointer. At the end of the call, the thread pointer is found in R3. - GET_TPOINTER, - - /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following - /// register assignment. - ADDI_TLSGD_L_ADDR, - - /// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// Op that combines two register copies of TOC entries - /// (region handle into R3 and variable offset into R4) followed by a - /// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. - /// This node is used in 64-bit mode as well (in which case the result is - /// G8RC and inputs are X3/X4). - TLSGD_AIX, - - /// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, - /// produces a call to .__tls_get_mod(_$TLSML\@ml). - GET_TLS_MOD_AIX, - - /// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) - /// Op that requires a single input of the module handle TOC entry in R3, - /// and generates a GET_TLS_MOD_AIX node which will be expanded into a call - /// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. - /// The only difference is the register class. - TLSLD_AIX, - - /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsld\@ha. - ADDIS_TLSLD_HA, - - /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - ADDI_TLSLD_L, - - /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - GET_TLSLD_ADDR, - - /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion - /// following register assignment. - ADDI_TLSLD_L_ADDR, - - /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds X3 to - /// sym\@dtprel\@ha. 
- ADDIS_DTPREL_HA, - - /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@dtprel\@l. - ADDI_DTPREL_L, - - /// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS - /// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. - PADDI_DTPREL, - - /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded - /// during instruction selection to optimize a BUILD_VECTOR into - /// operations on splats. This is necessary to avoid losing these - /// optimizations due to constant folding. - VADD_SPLAT, - - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little - /// endian. Maps to an xxswapd instruction that corrects an lxvd2x - /// or stxvd2x instruction. The chain is necessary because the - /// sequence replaces a load and needs to provide the same number - /// of outputs. - XXSWAPD, - - /// An SDNode for swaps that are not associated with any loads/stores - /// and thereby have no chain. - SWAP_NO_CHAIN, - - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or - /// lower (IDX=1) half of v4f32 to v2f64. - FP_EXTEND_HALF, - - /// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done - /// either through an add like PADDI or through a PC Relative load like - /// PLD. - MAT_PCREL_ADDR, - - /// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for - /// TLS global address when using dynamic access models. This can be done - /// through an add like PADDI. - TLS_DYNAMIC_MAT_PCREL_ADDR, - - /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address - /// when using local exec access models, and when prefixed instructions are - /// available. This is used with ADD_TLS to produce an add like PADDI. - TLS_LOCAL_EXEC_MAT_ADDR, - - /// ACC_BUILD = Build an accumulator register from 4 VSX registers. - ACC_BUILD, - - /// PAIR_BUILD = Build a vector pair register from 2 VSX registers. - PAIR_BUILD, - - /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of - /// an accumulator or pair register. This node is needed because - /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same - /// element type. - EXTRACT_VSX_REG, - - /// XXMFACC = This corresponds to the xxmfacc instruction. - XXMFACC, - - // Constrained conversion from floating point to int - FIRST_STRICTFP_OPCODE, - STRICT_FCTIDZ = FIRST_STRICTFP_OPCODE, - STRICT_FCTIWZ, - STRICT_FCTIDUZ, - STRICT_FCTIWUZ, - - /// Constrained integer-to-floating-point conversion instructions. - STRICT_FCFID, - STRICT_FCFIDU, - STRICT_FCFIDS, - STRICT_FCFIDUS, - - /// Constrained floating point add in round-to-zero mode. - STRICT_FADDRTZ, - LAST_STRICTFP_OPCODE = STRICT_FADDRTZ, - - /// SETBC - The ISA 3.1 (P10) SETBC instruction. - SETBC, - - /// SETBCR - The ISA 3.1 (P10) SETBCR instruction. - SETBCR, - - /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction - VSRQ, - - // NOTE: The nodes below may require PC-Rel specific patterns if the - // address could be PC-Relative. When adding new nodes below, consider - // whether or not the address can be PC-Relative and add the corresponding - // PC-relative patterns and tests. - - /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a - /// byte-swapping store instruction. It byte-swaps the low "Type" bits of - /// the GPRC input, then stores it through Ptr. Type can be either i16 or - /// i32. 
- FIRST_MEMORY_OPCODE, - STBRX = FIRST_MEMORY_OPCODE, - - /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a - /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, - /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 - /// or i32. - LBRX, - - /// STFIWX - The STFIWX instruction. The first operand is an input token - /// chain, then an f64 value to store, then an address to store it to. - STFIWX, - - /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point - /// load which sign-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWAX, - - /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point - /// load which zero-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWZX, - - /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an - /// integer smaller than 64 bits into a VSR. The integer is zero-extended. - /// This can be used for converting loaded integers to floating point. - LXSIZX, - - /// STXSIX - The STXSI[bh]X instruction. The first operand is an input - /// chain, then an f64 value to store, then an address to store it to, - /// followed by a byte-width for the store. - STXSIX, - - /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to an lxvd2x instruction that will be followed by - /// an xxswapd. - LXVD2X, - - /// LXVRZX - Load VSX Vector Rightmost and Zero Extend - /// This node represents v1i128 BUILD_VECTOR of a zero extending load - /// instruction from to i128. - /// Allows utilization of the Load VSX Vector Rightmost Instructions. - LXVRZX, - - /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on - /// the vector type to load vector in big-endian element order. - LOAD_VEC_BE, - - /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a - /// v2f32 value into the lower half of a VSR register. - LD_VSX_LH, - - /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// instructions such as LXVDSX, LXVWSX. - LD_SPLAT, - - /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that zero-extends. - ZEXT_LD_SPLAT, - - /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that sign-extends. - SEXT_LD_SPLAT, - - /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to an stxvd2x instruction that will be preceded by - /// an xxswapd. - STXVD2X, - - /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on - /// the vector type to store vector in big-endian element order. - STORE_VEC_BE, - - /// Store scalar integers from VSR. - ST_VSR_SCAL_INT, - - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes - /// except they ensure that the compare input is zero-extended for - /// sub-word versions because the atomic loads zero-extend. - ATOMIC_CMP_SWAP_8, - ATOMIC_CMP_SWAP_16, - - /// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr - /// The store conditional instruction ST[BHWD]ARX that produces a glue - /// result to attach it to a conditional branch. - STORE_COND, - - /// GPRC = TOC_ENTRY GA, TOC - /// Loads the entry for GA from the TOC, where the TOC base is given by - /// the last operand. 
- TOC_ENTRY, - LAST_MEMORY_OPCODE = TOC_ENTRY, - }; - - } // end namespace PPCISD - /// Define some predicates that are used for node matching. namespace PPC { @@ -752,10 +178,6 @@ namespace llvm { explicit PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI); - /// getTargetNodeName() - This method returns the name of a target specific - /// DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - bool isSelectSupported(SelectSupportKind Kind) const override { // PowerPC does not support scalar condition selects on vectors. return (Kind != SelectSupportKind::ScalarCondVectorVal); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index f3998113ddd52..3ecc58c04e378 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -149,28 +149,49 @@ def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + +// Square root instruction. def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; + +// Test instruction for software square root. def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; +// FCFID - The FCFID instruction, taking an f64 operand and producing +// and f64 value containing the FP representation of the integer that +// was temporarily in the f64 operand. def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; + +// Newer FCFID[US] integer-to-floating-point conversion instructions for +// unsigned integers and single-precision outputs. def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>; + +// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 +// operand, producing an f64 value containing the integer representation +// of that FP value. def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; + +// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for +// unsigned integers with round toward zero. def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>; -def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", - SDTFPRoundOp, [SDNPHasChain]>; -def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", - SDTFPRoundOp, [SDNPHasChain]>; +// Constrained integer-to-floating-point conversion instructions. +let IsStrictFP = true in { + def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", + SDTFPRoundOp, [SDNPHasChain]>; + def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", + SDTFPRoundOp, [SDNPHasChain]>; +} def PPCany_fcfid : PatFrags<(ops node:$op), [(PPCfcfid node:$op), @@ -185,28 +206,56 @@ def PPCany_fcfidus : PatFrags<(ops node:$op), [(PPCfcfidus node:$op), (PPCstrict_fcfidus node:$op)]>; +// Store scalar integers from VSR. 
def PPCstore_scal_int_from_vsr: SDNode<"PPCISD::ST_VSR_SCAL_INT", SDT_PPCstore_scal_int_from_vsr, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// STFIWX - The STFIWX instruction. The first operand is an input token +// chain, then an f64 value to store, then an address to store it to. def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point +// load which sign-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point +// load which zero-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an +// integer smaller than 64 bits into a VSR. The integer is zero-extended. +// This can be used for converting loaded integers to floating point. def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// STXSIX - The STXSI[bh]X instruction. The first operand is an input +// chain, then an f64 value to store, then an address to store it to, +// followed by a byte-width for the store. def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in +// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; -// Extract FPSCR (not modeled at the DAG level). +// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. def PPCmffs : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, [SDNPHasChain]>; -// Perform FADD in round-to-zero mode. +// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding +// towards zero. Used only as part of the long double-to-int +// conversion sequence. def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; + +// Constrained floating point add in round-to-zero mode. +let IsStrictFP = true in def PPCstrict_faddrtz: SDNode<"PPCISD::STRICT_FADDRTZ", SDTFPBinOp, [SDNPHasChain]>; @@ -214,72 +263,194 @@ def PPCany_faddrtz: PatFrags<(ops node:$lhs, node:$rhs), [(PPCfaddrtz node:$lhs, node:$rhs), (PPCstrict_faddrtz node:$lhs, node:$rhs)]>; +// FSEL - Traditional three-operand fsel node. def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. def PPCxsmaxc : SDNode<"PPCISD::XSMAXC", SDT_PPCFPMinMax, []>; def PPCxsminc : SDNode<"PPCISD::XSMINC", SDT_PPCFPMinMax, []>; + +// Hi/Lo - These represent the high and low 16-bit parts of a global +// address respectively. These nodes have two operands, the first of +// which must be a TargetGlobalAddress, and the second of which must be a +// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', +// though these are usually folded into other nodes. def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; + +// GPRC = TOC_ENTRY GA, TOC +// Loads the entry for GA from the TOC, where the TOC base is given by +// the last operand. 
def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad, SDNPMemOperand]>; +// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS +// for non-position independent code on PPC32. def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>; +// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec +// TLS model, produces an ADDIS8 instruction that adds the GOT +// base to sym\@got\@tprel\@ha. def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; + +// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec +// TLS model, produces a LD instruction with base register G8RReg +// and offset sym\@got\@tprel\@l. This completes the addition that +// finds the offset of "sym" relative to the thread pointer. def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, [SDNPMayLoad]>; + +// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec +// and local-exec TLS models, produces an ADD instruction that adds +// the contents of G8RReg to the thread pointer. Symbol contains a +// relocation sym\@tls which is to be replaced by the thread pointer +// and identifies to the linker that the instruction is part of a +// TLS sequence. def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; + +// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsgd\@ha. def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; + +// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; + +// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, +// produces a call to .__tls_get_mod(_$TLSML\@ml). def PPCgetTlsMod : SDNode<"PPCISD::GET_TLS_MOD_AIX", SDTIntUnaryOp>; + +// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on +// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread +// pointer. At the end of the call, the thread pointer is found in R3. def PPCgetTpointer : SDNode<"PPCISD::GET_TPOINTER", SDTIntLeaf, []>; + +// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following +// register assignment. def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// Op that combines two register copies of TOC entries +// (region handle into R3 and variable offset into R4) followed by a +// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. +// This node is used in 64-bit mode as well (in which case the result is +// G8RC and inputs are X3/X4). 
def PPCTlsgdAIX : SDNode<"PPCISD::TLSGD_AIX", SDTIntBinOp>; + +// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) +// Op that requires a single input of the module handle TOC entry in R3, +// and generates a GET_TLS_MOD_AIX node which will be expanded into a call +// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. +// The only difference is the register class. def PPCTlsldAIX : SDNode<"PPCISD::TLSLD_AIX", SDTIntUnaryOp>; + +// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsld\@ha. def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsld\@l and stores the result in X3. Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; + +// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; + +// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion +// following register assignment. def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds X3 to +// sym\@dtprel\@ha. def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>; + +// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@dtprel\@l. def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; + +// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS +// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. def PPCpaddiDtprel : SDNode<"PPCISD::PADDI_DTPREL", SDTIntBinOp>; +// VPERM - The PPC VPERM Instruction. def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +// XXSPLT - The PPC VSX splat instructions def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; + +// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for +// converting immediate single precision numbers to double precision +// vector or scalar. def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>; + +// VECINSERT - The PPC vector insert instruction def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; + +// XXPERMDI - The PPC XXPERMDI instruction def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; + +// VECSHL - The PPC vector shift left instruction def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; +// The CMPB instruction (takes two operands of i32 or i64). def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift // amounts. These nodes are generated by the multi-precision shift code. +// +// For scalar types, only the last `n + 1` bits of the shift amounts +// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. +// for exact behaviors. +// +// For vector types, only the last n bits are used. See vsld. 
def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;

+// FNMSUB - Negated multiply-subtract instruction.
def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>;

+// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign
+// word and shift left immediate.
def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>;

-def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ",
-                              SDTFPUnaryOp, [SDNPHasChain]>;
-def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ",
-                              SDTFPUnaryOp, [SDNPHasChain]>;
-def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ",
-                              SDTFPUnaryOp, [SDNPHasChain]>;
-def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ",
+// Constrained conversion from floating point to int
+let IsStrictFP = true in {
+  def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ",
                                SDTFPUnaryOp, [SDNPHasChain]>;
+  def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ",
+                                SDTFPUnaryOp, [SDNPHasChain]>;
+  def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ",
+                                 SDTFPUnaryOp, [SDNPHasChain]>;
+  def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ",
+                                 SDTFPUnaryOp, [SDNPHasChain]>;
+}

def PPCany_fctidz : PatFrags<(ops node:$op),
                             [(PPCstrict_fctidz node:$op),
@@ -294,19 +465,24 @@ def PPCany_fctiwuz : PatFrags<(ops node:$op),
                              [(PPCstrict_fctiwuz node:$op),
                               (PPCfctiwuz node:$op)]>;

-// Move 2 i64 values into a VSX register
+// Direct move of 2 consecutive GPRs to a VSX register.
def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128",
                           SDTypeProfile<1, 2,
                             [SDTCisFP<0>, SDTCisSameSizeAs<1,2>,
                              SDTCisSameAs<1,2>]>, []>;

+// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
+// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
+// unsupported for this target.
+// Merge 2 GPRs to a single SPE register.
def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64",
                           SDTypeProfile<1, 2,
                             [SDTCisVT<0, f64>, SDTCisVT<1,i32>,
                              SDTCisVT<1,i32>]>, []>;

+// Extract SPE register component, second argument is high or low.
def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE",
                            SDTypeProfile<1, 2,
                              [SDTCisVT<0, i32>, SDTCisVT<1, f64>,
@@ -320,6 +496,11 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd,
                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+
+// CALL - A direct function call.
+// CALL_NOP is a call with the special NOP which follows 64-bit
+// SVR4 calls and 32-bit/64-bit AIX calls.
+// CALL_NOTOC is a call where the caller does not use the TOC.
def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall,
                     [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                      SDNPVariadic]>;
@@ -329,17 +510,28 @@ def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
def PPCcall_notoc : SDNode<"PPCISD::CALL_NOTOC", SDT_PPCCall,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPVariadic]>;
+
+// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+// MTCTR instruction.
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+// BCTRL instruction.
def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                       SDNPVariadic]>;
+
+// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
+// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX
+// and 64-bit AIX.
def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; -// Call nodes for strictfp calls (that define RM). +// The variants that implicitly define rounding mode for calls with +// strictfp semantics. def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -357,42 +549,81 @@ def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Return with a glue operand, matched by 'blr' def PPCretglue : SDNode<"PPCISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// TC_RETURN - A tail call return. +// operand #0 chain +// operand #1 callee (register or absolute) +// operand #2 stack adjustment +// operand #3 optional in flag def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// EH_SJLJ_SETJMP - SjLj exception handling setjmp. def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect]>; + +// EH_SJLJ_LONGJMP - SjLj exception handling longjmp. def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; +// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* +// instructions. For lack of better number, we use the opcode number +// encoding for the OPC field to identify the compare. For example, 838 +// is VCMPGTSH. def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; + +// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the +// altivec VCMP*_rec instructions. For lack of better number, we use the +// opcode number encoding for the OPC field to identify the compare. For +// example, 838 is VCMPGTSH. def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; +// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This +// corresponds to the COND_BRANCH pseudo instruction. CRRC is the +// condition register to branch on, OPC is the branch opcode to use (e.g. +// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is +// an optional input flag argument. def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, [SDNPHasChain, SDNPOptInGlue]>; -// PPC-specific atomic operations. +// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes +// except they ensure that the compare input is zero-extended for +// sub-word versions because the atomic loads zero-extend. def PPCatomicCmpSwap_8 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def PPCatomicCmpSwap_16 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a +// byte-swapping load instruction. It loads "Type" bits, byte swaps it, +// then puts it in the bottom bits of the GPRC. TYPE can be either i16 +// or i32. def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a +// byte-swapping store instruction. It byte-swaps the low "Type" bits of +// the GPRC input, then stores it through Ptr. Type can be either i16 or +// i32. 
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr +// The store conditional instruction ST[BHWD]ARX that produces a glue +// result to attach it to a conditional branch. def PPCStoreCond : SDNode<"PPCISD::STORE_COND", SDT_StoreCond, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPOutGlue]>; -// Instructions to set/unset CR bit 6 for SVR4 vararg calls +// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, @@ -401,17 +632,44 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; def SDTDynAreaOp : SDTypeProfile<1, 1, []>; + +// The following two target-specific nodes are used for calls through +// function pointers in the 64-bit SVR4 ABI. + +// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an allocation on the stack. def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; + +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an offset from native SP to the address of the most recent +// dynamic alloca. def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; + +// To avoid stack clash, allocation is performed by block and each block is +// probed. def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>; // PC Relative Specific Nodes + +// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done +// either through an add like PADDI or through a PC Relative load like +// PLD. def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for +// TLS global address when using dynamic access models. This can be done +// through an add like PADDI. def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address +// when using local exec access models, and when prefixed instructions are +// available. This is used with ADD_TLS to produce an add like PADDI. def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; +// These nodes represent PPC arithmetic operations with carry. def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, [SDNPCommutative]>; def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, @@ -2535,6 +2793,7 @@ defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$RST), (ins f8rc:$RA, f8rc:$RB), // Reciprocal estimates. let mayRaiseFPException = 1 in { +// Reciprocal estimate instructions (unary FP ops). defm FRE : XForm_26r<63, 24, (outs f8rc:$RST), (ins f8rc:$RB), "fre", "$RST, $RB", IIC_FPGeneral, [(set f64:$RST, (PPCfre f64:$RB))]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 2d8c633b9fef6..bd9a999237c09 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -83,15 +83,31 @@ def SDT_PPCsetbc : SDTypeProfile<1, 1, [ // ISA 3.1 specific PPCISD nodes. // +// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. 
def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>; + +// ACC_BUILD = Build an accumulator register from 4 VSX registers. def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>; + +// PAIR_BUILD = Build a vector pair register from 2 VSX registers. def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>; + +// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of +// an accumulator or pair register. This node is needed because +// EXTRACT_SUBVECTOR expects the input and output vectors to have the same +// element type. def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx, []>; def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx, []>; + +// XXMFACC = This corresponds to the xxmfacc instruction. def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>; + +// SETBC - The ISA 3.1 (P10) SETBC instruction. def PPCsetbc : SDNode<"PPCISD::SETBC", SDT_PPCsetbc, []>; + +// SETBCR - The ISA 3.1 (P10) SETBCR instruction. def PPCsetbcr : SDNode<"PPCISD::SETBCR", SDT_PPCsetbc, []>; //===----------------------------------------------------------------------===// @@ -105,7 +121,10 @@ def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [ SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2> ]>; -// PPC Specific DAG Nodes. +// LXVRZX - Load VSX Vector Rightmost and Zero Extend +// This node represents v1i128 BUILD_VECTOR of a zero extending load +// instruction from to i128. +// Allows utilization of the Load VSX Vector Rightmost Instructions. def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 885bed670e319..d72201df5b002 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -87,31 +87,91 @@ def SDT_PPCxxperm : SDTypeProfile<1, 3, [ SDTCisVT<0, v2f64>, SDTCisVT<1, v2f64>, SDTCisVT<2, v2f64>, SDTCisVT<3, v4i32>]>; //--------------------------- Custom PPC nodes -------------------------------// + +// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. +// Maps directly to an lxvd2x instruction that will be followed by +// an xxswapd. def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to an stxvd2x instruction that will be preceded by +// an xxswapd. def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. +// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on +// the vector type to load vector in big-endian element order. def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on +// the vector type to store vector in big-endian element order. def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little +// endian. Maps to an xxswapd instruction that corrects an lxvd2x +// or stxvd2x instruction. 
The chain is necessary because the +// sequence replaces a load and needs to provide the same number +// of outputs. def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; + +// Direct move from a VSX register to a GPR def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (algebraic) def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (zero) def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; + +// Extract a subvector from signed integer vector and convert to FP. +// It is primarily used to convert a (widened) illegal integer vector +// type to a legal floating point vector type. +// For example v2i32 -> widened to v4i32 -> v2f64 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; + +// Extract a subvector from unsigned integer vector and convert to FP. +// As with SINT_VEC_TO_FP, used for converting illegal types. def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; + +// An SDNode for swaps that are not associated with any loads/stores +// and thereby have no chain. def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; +// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or +// lower (IDX=1) half of v4f32 to v2f64. def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; + +// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a +// v2f32 value into the lower half of a VSR register. def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory +// instructions such as LXVDSX, LXVWSX. def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// that zero-extends. def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// that sign-extends. def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to +// place the value into the least significant element of the most +// significant doubleword in the vector. This is not element zero for +// anything smaller than a doubleword on either endianness. This node has +// the same semantics as SCALAR_TO_VECTOR except that the value remains in +// the aforementioned location in the vector register. 
def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", SDTypeProfile<1, 1, []>, []>; diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index 93a4693c50168..80aa1122167df 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -7,20 +7,72 @@ //===----------------------------------------------------------------------===// #include "PPCSelectionDAGInfo.h" -#include "PPCISelLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "PPCGenSDNodeInfo.inc" using namespace llvm; +PPCSelectionDAGInfo::PPCSelectionDAGInfo() + : SelectionDAGGenTargetInfo(PPCGenSDNodeInfo) {} + PPCSelectionDAGInfo::~PPCSelectionDAGInfo() = default; -bool PPCSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_MEMORY_OPCODE && - Opcode <= PPCISD::LAST_MEMORY_OPCODE; +const char *PPCSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { + switch (static_cast(Opcode)) { + case PPCISD::GlobalBaseReg: + return "PPCISD::GlobalBaseReg"; + case PPCISD::SRA_ADDZE: + return "PPCISD::SRA_ADDZE"; + case PPCISD::READ_TIME_BASE: + return "PPCISD::READ_TIME_BASE"; + case PPCISD::MFOCRF: + return "PPCISD::MFOCRF"; + case PPCISD::ANDI_rec_1_EQ_BIT: + return "PPCISD::ANDI_rec_1_EQ_BIT"; + case PPCISD::ANDI_rec_1_GT_BIT: + return "PPCISD::ANDI_rec_1_GT_BIT"; + case PPCISD::BDNZ: + return "PPCISD::BDNZ"; + case PPCISD::BDZ: + return "PPCISD::BDZ"; + case PPCISD::PPC32_PICGOT: + return "PPCISD::PPC32_PICGOT"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; + } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } -bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE && - Opcode <= PPCISD::LAST_STRICTFP_OPCODE; +void PPCSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + default: + break; + case PPCISD::DYNAREAOFFSET: + // invalid number of results; expected 2, got 1 + case PPCISD::TOC_ENTRY: + // invalid number of results; expected 1, got 2 + case PPCISD::STORE_COND: + // invalid number of results; expected 2, got 3 + case PPCISD::LD_SPLAT: + case PPCISD::SEXT_LD_SPLAT: + case PPCISD::ZEXT_LD_SPLAT: + // invalid number of operands; expected 2, got 3 + case PPCISD::ST_VSR_SCAL_INT: + // invalid number of operands; expected 4, got 5 + case PPCISD::XXPERM: + // operand #1 must have type v2f64, but has type v16i8 + case PPCISD::ACC_BUILD: + // operand #3 must have type v4i32, but has type v16i8 + case PPCISD::PAIR_BUILD: + // operand #1 must have type v4i32, but has type v16i8 + return; + } + + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } std::pair PPCSelectionDAGInfo::EmitTargetCodeForMemcmp( diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index f962a7a5321aa..ffe8982ce1af4 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -11,15 +11,66 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "PPCGenSDNodeInfo.inc" + namespace llvm { +namespace PPCISD { + +enum NodeType : unsigned { + /// The result of the mflr at function entry, used for PIC code. + GlobalBaseReg = GENERATED_OPCODE_END, + + /// The combination of sra[wd]i and addze used to implemented signed + /// integer division by a power of 2. 
The first operand is the dividend,
+  /// and the second is the constant shift amount (representing the
+  /// divisor).
+  SRA_ADDZE,
+
+  /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+  /// This copies the bits corresponding to the specified CRREG into the
+  /// resultant GPR. Bits corresponding to other CR regs are undefined.
+  MFOCRF,
+
+  // FIXME: Remove these once the ANDI glue bug is fixed:
+  /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+  /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+  /// implement truncation of i32 or i64 to i1.
+  ANDI_rec_1_EQ_BIT,
+  ANDI_rec_1_GT_BIT,
+
+  // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
+  // target (returns (Lo, Hi)). It takes a chain operand.
+  READ_TIME_BASE,
-class PPCSelectionDAGInfo : public SelectionDAGTargetInfo {
+  /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+  /// loops.
+  BDNZ,
+  BDZ,
+
+  /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+  /// local dynamic TLS and position independent code on PPC32.
+  PPC32_PICGOT,
+
+  /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+  /// during instruction selection to optimize a BUILD_VECTOR into
+  /// operations on splats. This is necessary to avoid losing these
+  /// optimizations due to constant folding.
+  VADD_SPLAT,
+};
+
+} // namespace PPCISD
+
+class PPCSelectionDAGInfo : public SelectionDAGGenTargetInfo {
 public:
+  PPCSelectionDAGInfo();
+
   ~PPCSelectionDAGInfo() override;

-  bool isTargetMemoryOpcode(unsigned Opcode) const override;
+  const char *getTargetNodeName(unsigned Opcode) const override;

-  bool isTargetStrictFPOpcode(unsigned Opcode) const override;
+  void verifyTargetNode(const SelectionDAG &DAG,
+                        const SDNode *N) const override;

   std::pair<SDValue, SDValue>
   EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,

From 5cf5eb7714ea4d2a9a7775c1a054b9ea6556b78b Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Mon, 17 Nov 2025 15:00:36 -0800
Subject: [PATCH 100/105] [MemProf] Fixup edges for largest N cold contexts
 (#167599)

We build the callsite graph by first adding nodes and edges for all
allocation contexts, then match the interior callsite nodes onto actual
calls (IR or summary), which due to inlining may result in the
generation of new nodes representing the inlined context sequence.

We attempt to update edges correctly during this process, but in the
case of recursion it is not always possible to get this right.
Specifically, when creating new inlined sequence nodes for stack ids on
recursive cycles we cannot always update the edges correctly, because we
have lost the original ordering of the context.

This PR introduces a mechanism, guarded by the -memprof-top-n-important
flag, to keep track of extra information for the largest N cold
contexts. Another flag, -memprof-fixup-important (enabled by default),
performs more expensive fixup of the edges for those largest N cold
contexts, by saving and walking the original ordered list of stack ids
from the context.
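As a rough illustration of how the new flags compose (a sketch based on
the tests added in this patch; the input file name is a placeholder):

  opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
      -memprof-top-n-important=10 -memprof-fixup-important \
      -stats input.ll -S

Setting -memprof-top-n-important=0 disables the tracking entirely, while
-memprof-fixup-important=false keeps the tracking but skips the more
expensive edge fixup.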
--- .../IPO/MemProfContextDisambiguation.cpp | 262 ++++++++++++++++-- llvm/test/ThinLTO/X86/memprof-fixup.ll | 129 +++++++++ .../MemProfContextDisambiguation/fixup.ll | 105 +++++++ 3 files changed, 480 insertions(+), 16 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof-fixup.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index d35ae4730a9f3..0f4bc649df720 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments, STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); +STATISTIC(NumImportantContextIds, "Number of important context ids"); +STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted"); +STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added"); +STATISTIC(NumFixedContexts, "Number of contexts with fixed edges"); static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -223,9 +227,18 @@ static cl::opt MemProfRequireDefinitionForPromotion( extern cl::opt MemProfReportHintedSizes; extern cl::opt MinClonedColdBytePercent; +cl::opt MemProfTopNImportant( + "memprof-top-n-important", cl::init(10), cl::Hidden, + cl::desc("Number of largest cold contexts to consider important")); + +cl::opt MemProfFixupImportant( + "memprof-fixup-important", cl::init(true), cl::Hidden, + cl::desc("Enables edge fixup for important contexts")); + } // namespace llvm namespace { + /// CRTP base for graphs built from either IR or ThinLTO summary index. /// /// The graph represents the call contexts in all memprof metadata on allocation @@ -581,17 +594,26 @@ class CallsiteContextGraph { /// Adds nodes for the given MIB stack ids. template - void addStackNodesForMIB(ContextNode *AllocNode, - CallStack &StackContext, - CallStack &CallsiteContext, - AllocationType AllocType, - ArrayRef ContextSizeInfo); + void addStackNodesForMIB( + ContextNode *AllocNode, CallStack &StackContext, + CallStack &CallsiteContext, AllocationType AllocType, + ArrayRef ContextSizeInfo, + std::map &TotalSizeToContextIdTopNCold); /// Matches all callsite metadata (or summary) to the nodes created for /// allocation memprof MIB metadata, synthesizing new nodes to reflect any /// inlining performed on those callsite instructions. void updateStackNodes(); + /// Optionally fixup edges for the N largest cold contexts to better enable + /// cloning. This is particularly helpful if the context includes recursion + /// as well as inlining, resulting in a single stack node for multiple stack + /// ids in the context. With recursion it is particularly difficult to get the + /// edge updates correct as in the general case we have lost the original + /// stack id ordering for the context. Do more expensive fixup for the largest + /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant. + void fixupImportantContexts(); + /// Update graph to conservatively handle any callsite stack nodes that target /// multiple different callee target functions. 
void handleCallsitesWithMultipleTargets(); @@ -658,7 +680,8 @@ class CallsiteContextGraph { void assignStackNodesPostOrder( ContextNode *Node, DenseSet &Visited, DenseMap> &StackIdToMatchingCalls, - DenseMap &CallToMatchingCall); + DenseMap &CallToMatchingCall, + const DenseSet &ImportantContextIds); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -859,6 +882,50 @@ class CallsiteContextGraph { /// nodes. DenseMap StackEntryIdToContextNodeMap; + /// Saves information for the contexts identified as important (the largest + /// cold contexts up to MemProfTopNImportant). + struct ImportantContextInfo { + // The original list of leaf first stack ids corresponding to this context. + std::vector StackIds; + // Max length of stack ids corresponding to a single stack ContextNode for + // this context (i.e. the max length of a key in StackIdsToNode below). + unsigned MaxLength = 0; + // Mapping of slices of the stack ids to the corresponding ContextNode + // (there can be multiple stack ids due to inlining). Populated when + // updating stack nodes while matching them to the IR or summary. + std::map, ContextNode *> StackIdsToNode; + }; + + // Map of important full context ids to information about each. + DenseMap ImportantContextIdInfo; + + // For each important context id found in Node (if any), records the list of + // stack ids that corresponded to the given callsite Node. There can be more + // than one in the case of inlining. + void recordStackNode(std::vector &StackIds, ContextNode *Node, + // We pass in the Node's context ids to avoid the + // overhead of computing them as the caller already has + // them in some cases. + const DenseSet &NodeContextIds, + const DenseSet &ImportantContextIds) { + if (!MemProfTopNImportant) { + assert(ImportantContextIds.empty()); + return; + } + DenseSet Ids = + set_intersection(NodeContextIds, ImportantContextIds); + if (Ids.empty()) + return; + auto Size = StackIds.size(); + for (auto Id : Ids) { + auto &Entry = ImportantContextIdInfo[Id]; + Entry.StackIdsToNode[StackIds] = Node; + // Keep track of the max to simplify later analysis. + if (Size > Entry.MaxLength) + Entry.MaxLength = Size; + } + } + /// Maps to track the calls to their corresponding nodes in the graph. MapVector AllocationCallToContextNodeMap; MapVector NonAllocationCallToContextNodeMap; @@ -1353,7 +1420,8 @@ template void CallsiteContextGraph::addStackNodesForMIB( ContextNode *AllocNode, CallStack &StackContext, CallStack &CallsiteContext, AllocationType AllocType, - ArrayRef ContextSizeInfo) { + ArrayRef ContextSizeInfo, + std::map &TotalSizeToContextIdTopNCold) { // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1361,8 +1429,33 @@ void CallsiteContextGraph::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + bool IsImportant = false; if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; + // If this is a cold allocation, and we are collecting non-zero largest + // contexts, see if this is a candidate. + if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) { + uint64_t TotalCold = 0; + for (auto &CSI : ContextSizeInfo) + TotalCold += CSI.TotalSize; + // Record this context if either we haven't found the first top-n largest + // yet, or if it is larger than the smallest already recorded. 
+ if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant || + // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly + // sorted in ascending size of its key which is the size. + TotalCold > TotalSizeToContextIdTopNCold.begin()->first) { + if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) { + // Remove old one and its associated entries. + auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second; + TotalSizeToContextIdTopNCold.erase( + TotalSizeToContextIdTopNCold.begin()); + assert(ImportantContextIdInfo.count(IdToRemove)); + ImportantContextIdInfo.erase(IdToRemove); + } + TotalSizeToContextIdTopNCold[TotalCold] = LastContextId; + IsImportant = true; + } + } Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -1381,6 +1474,8 @@ void CallsiteContextGraph::addStackNodesForMIB( for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); ContextIter != StackContext.end(); ++ContextIter) { auto StackId = getStackId(*ContextIter); + if (IsImportant) + ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { StackNode = createNewNode(/*IsAllocation=*/false); @@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode *Node, template void CallsiteContextGraph:: - assignStackNodesPostOrder( - ContextNode *Node, DenseSet &Visited, - DenseMap> - &StackIdToMatchingCalls, - DenseMap &CallToMatchingCall) { + assignStackNodesPostOrder(ContextNode *Node, + DenseSet &Visited, + DenseMap> + &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall, + const DenseSet &ImportantContextIds) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1620,7 +1716,7 @@ void CallsiteContextGraph:: continue; } assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); } // If this node's stack id is in the map, update the graph to contain new @@ -1648,6 +1744,7 @@ void CallsiteContextGraph:: Node->setCall(Call); NonAllocationCallToContextNodeMap[Call] = Node; NodeToCallingFunc[Node] = Func; + recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds); return; } } @@ -1786,6 +1883,9 @@ void CallsiteContextGraph:: : CurNode->computeAllocType(); PrevNode = CurNode; } + + recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds); + if (VerifyNodes) { checkNode(NewNode, /*CheckEdges=*/true); for (auto Id : Ids) { @@ -1798,6 +1898,122 @@ void CallsiteContextGraph:: } } +template +void CallsiteContextGraph::fixupImportantContexts() { + if (ImportantContextIdInfo.empty()) + return; + + // Update statistics as we are done building this map at this point. + NumImportantContextIds = ImportantContextIdInfo.size(); + + if (!MemProfFixupImportant) + return; + + if (ExportToDot) + exportToDot("beforestackfixup"); + + // For each context we identified as important, walk through the saved context + // stack ids in order from leaf upwards, and make sure all edges are correct. + // These can be difficult to get right when updating the graph while mapping + // nodes onto summary or IR, especially when there is recursion. In + // particular, when we have created new nodes to reflect inlining, it is + // sometimes impossible to know exactly how to update the edges in the face of + // recursion, as we have lost the original ordering of the stack ids in the + // contexts. 
+  // TODO: Consider only doing this if we detect the context has recursive
+  // cycles.
+  //
+  // I.e. assume we have a context with stack ids like: {A B A C A D E}
+  // and let's say A was inlined into B, C, and D. The original graph will have
+  // multiple recursive cycles through A. When we match the original context
+  // nodes onto the IR or summary, we will merge {A B} into one context node,
+  // {A C} into another, and {A D} into another. Looking at the stack sequence
+  // above, we should end up with a non-cyclic set of edges like:
+  // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the
+  // original ordering, we won't get the edges correct initially (it's
+  // impossible without the original ordering). Here we do the fixup (adding
+  // and removing edges where necessary) for this context. In this case the
+  // ImportantContextInfo struct should have MaxLength = 2, and map entries
+  // for {A B}, {A C}, {A D}, and {E}.
+  for (auto &[CurContextId, Info] : ImportantContextIdInfo) {
+    if (Info.StackIdsToNode.empty())
+      continue;
+    bool Changed = false;
+    ContextNode *PrevNode = nullptr;
+    ContextNode *CurNode = nullptr;
+    DenseSet<ContextEdge *> VisitedEdges;
+    ArrayRef<uint64_t> AllStackIds(Info.StackIds);
+    // Try to identify what callsite ContextNode maps to which slice of the
+    // context's ordered stack ids.
+    for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) {
+      // We will do this greedily, trying up to MaxLength stack ids in a row, to
+      // see if we recorded a context node for that sequence.
+      auto Len = Info.MaxLength;
+      auto LenToEnd = AllStackIds.size() - I;
+      if (Len > LenToEnd)
+        Len = LenToEnd;
+      CurNode = nullptr;
+      // Try to find a recorded context node starting with the longest length
+      // recorded, and on down until we check for just a single stack node.
+      for (; Len > 0; Len--) {
+        // Get the slice of the original stack id sequence to check.
+        auto CheckStackIds = AllStackIds.slice(I, Len);
+        auto EntryIt = Info.StackIdsToNode.find(CheckStackIds);
+        if (EntryIt == Info.StackIdsToNode.end())
+          continue;
+        CurNode = EntryIt->second;
+        // Skip forward so we don't try to look for the ones we just matched.
+        // We increment by Len - 1, because the outer for loop will increment I.
+        I += Len - 1;
+        break;
+      }
+      // Give up if we couldn't find a node. Since we need to clone from the
+      // leaf allocation upwards, there is no sense in doing any more fixup
+      // further up the context if we couldn't match part of the original
+      // stack context onto a callsite node.
+      if (!CurNode)
+        break;
+      // No edges to fix up until we have a pair of nodes that should be
+      // adjacent in the graph.
+      if (!PrevNode)
+        continue;
+      // See if we already have a call edge from CurNode to PrevNode.
+      auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode);
+      if (CurEdge) {
+        // We already have an edge. Make sure it contains this context id.
+        if (CurEdge->getContextIds().insert(CurContextId).second) {
+          NumFixupEdgeIdsInserted++;
+          Changed = true;
+        }
+      } else {
+        // No edge exists - add one.
+        NumFixupEdgesAdded++;
+        DenseSet<uint32_t> ContextIds({CurContextId});
+        auto AllocType = computeAllocType(ContextIds);
+        auto NewEdge = std::make_shared<ContextEdge>(
+            PrevNode, CurNode, AllocType, std::move(ContextIds));
+        PrevNode->CallerEdges.push_back(NewEdge);
+        CurNode->CalleeEdges.push_back(NewEdge);
+        // Save the new edge for the below handling.
+        CurEdge = NewEdge.get();
+        Changed = true;
+      }
+      VisitedEdges.insert(CurEdge);
+      // Now remove this context id from any other caller edges calling
+      // PrevNode.
+ for (auto &Edge : PrevNode->CallerEdges) { + // Skip the edge updating/created above and edges we have already + // visited (due to recursion). + if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get())) + Edge->getContextIds().erase(CurContextId); + } + } + if (Changed) + NumFixedContexts++; + } +} + template void CallsiteContextGraph::updateStackNodes() { // Map of stack id to all calls with that as the last (outermost caller) @@ -2043,9 +2259,14 @@ void CallsiteContextGraph::updateStackNodes() { // nodes representing any inlining at interior callsites. Note we move the // associated context ids over to the new nodes. DenseSet Visited; + DenseSet ImportantContextIds(llvm::from_range, + ImportantContextIdInfo.keys()); for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); + + fixupImportantContexts(); + if (VerifyCCG) check(); } @@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( Module &M, llvm::function_ref OREGetter) : Mod(M), OREGetter(OREGetter) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map TotalSizeToContextIdTopNCold; for (auto &F : M) { std::vector CallsWithMetadata; for (auto &BB : F) { @@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( CallStack StackContext(StackNode); addStackNodesForMIB( AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD), ContextSizeInfo); + getMIBAllocType(MIBMD), ContextSizeInfo, + TotalSizeToContextIdTopNCold); } // If exporting the graph to dot and an allocation id of interest was // specified, record all the context ids for this allocation node. @@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( llvm::function_ref isPrevailing) : Index(Index), isPrevailing(isPrevailing) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map TotalSizeToContextIdTopNCold; for (auto &I : Index) { auto VI = Index.getValueInfo(I); for (auto &S : VI.getSummaryList()) { @@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } addStackNodesForMIB::const_iterator>( AllocNode, StackContext, EmptyContext, MIB.AllocType, - ContextSizeInfo); + ContextSizeInfo, TotalSizeToContextIdTopNCold); I++; } // If exporting the graph to dot and an allocation id of interest was diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll new file mode 100644 index 0000000000000..afed80fc562c1 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll @@ -0,0 +1,129 @@ +;; Test fixup of largest cold contexts. + +;; This case has multiple recursive cycles in the cold context, which can be +;; made non-recursive with the inlining in the code. + +;; -stats requires asserts +; REQUIRES: asserts + +;; Need context sizes in summary, so enable reporting. +; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o + +;; First try disabling detection of the largest cold contexts. +;; We will not get any cloning. 
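+;; The --implicit-check-not patterns on the following invocation verify that
+;; no clones are created and no importance or fixup statistics are emitted.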
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -memprof-top-n-important=0 \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of important context ids" \ +; RUN: --implicit-check-not="Number of fixup" + +;; Allow default detection of the largest cold contexts, but disable fixup. +;; We should find 1 important context, but still not get cloning. +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -memprof-fixup-important=false \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \ +; RUN: --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of fixup" + +; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids + +;; Allow default detection of largest cold contexts, fixup is enabled by default. +;; This case should get fixup and cloning. +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1 + +; TOPN1: created clone E.memprof.1 +; TOPN1: call in clone E marked with memprof allocation attribute notcold +; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold +; TOPN1: created clone DB.memprof.1 +; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1 +; TOPN1: created clone CB.memprof.1 +; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1 +; TOPN1: created clone A.memprof.1 +; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1 +; TOPN1: call in clone main assigned to call function clone A.memprof.1 + +; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges +; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added +; TOPN1: 1 memprof-context-disambiguation - Number of important context ids + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @E() { +entry: + %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14 + ret void +} + +define void @DB() { +entry: + tail call void @E(), !callsite !17 + ret void +} + +define void @CB() { +entry: + tail call void @DB(), !callsite !22 + ret void +} + +define void @A() { +entry: + tail call void @CB(), !callsite !20 + ret 
void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234, i64 567}
+!25 = !{i64 678}
+!27 = !{i64 789}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
new file mode 100644
index 0000000000000..a08f89b5bbe97
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll
@@ -0,0 +1,105 @@
+;; Test fixup of largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN: -memprof-top-n-important=0 \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN: -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN: --implicit-check-not="Number of cold static allocations" \
+; RUN: --implicit-check-not="Number of function clones" \
+; RUN: --implicit-check-not="Number of important context ids" \
+; RUN: --implicit-check-not="Number of fixup"
+
+;; Allow default detection of the largest cold contexts, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN: -memprof-fixup-important=false \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN: -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN: --implicit-check-not="created clone" \
+; RUN: --implicit-check-not="Number of cold static allocations" \
+; RUN: --implicit-check-not="Number of function clones" \
+; RUN: --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Allow default detection of largest cold contexts, fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1 + +; TOPN1: created clone E.memprof.1 +; TOPN1: created clone DB.memprof.1 +; TOPN1: created clone CB.memprof.1 +; TOPN1: created clone A.memprof.1 +; TOPN1: call in clone main assigned to call function clone A.memprof.1 +; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1 +; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1 +; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1 +; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold +; TOPN1: call in clone E marked with memprof allocation attribute notcold + +; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges +; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added +; TOPN1: 1 memprof-context-disambiguation - Number of important context ids + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @E() { +entry: + %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14 + ret void +} + +define void @DB() { +entry: + tail call void @E(), !callsite !17 + ret void +} + +define void @CB() { +entry: + tail call void @DB(), !callsite !22 + ret void +} + +define void @A() { +entry: + tail call void @CB(), !callsite !20 + ret void +} + +define i32 @main() { +entry: + tail call void @A(), !callsite !25 + tail call void @A(), !callsite !27 + ret i32 0 +} + +declare ptr @_Znam(i64) + +!7 = !{!8, !10} +!8 = !{!9, !"cold", !2} +!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678} +!2 = !{i64 12345, i64 200} +!10 = !{!11, !"notcold", !3} +!3 = !{i64 23456, i64 200} +!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789} +!14 = !{i64 123} +!17 = !{i64 234, i64 345} +!22 = !{i64 234, i64 456} +!20 = !{i64 234, i64 567} +!25 = !{i64 678} +!27 = !{i64 789} From 7a14ef0293ba8eb6942118e560b8bb7208822291 Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Mon, 17 Nov 2025 15:25:33 -0800 Subject: [PATCH 101/105] [scudo] Skip test if mlock fails. (#168448) Some linux versions might not support the mlock call, so skip that part of the test if the mlock fails. --- .../lib/scudo/standalone/tests/map_test.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp index afdfe5be85fb6..9d1a35c44679d 100644 --- a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp @@ -120,17 +120,17 @@ TEST(ScudoMapTest, Zeroing) { #if SCUDO_LINUX // Now verify that if madvise fails, the data is still zeroed. 
   memset(Data, 1U, MemMap.getCapacity());
-  EXPECT_NE(-1, mlock(Data, MemMap.getCapacity()));
-
-  EXPECT_EQ(1U, Data[0]);
-  EXPECT_EQ(1U, Data[PageSize]);
-  EXPECT_EQ(1U, Data[PageSize * 2]);
-  MemMap.releaseAndZeroPagesToOS(MemMap.getBase(), MemMap.getCapacity());
-  EXPECT_EQ(0U, Data[0]);
-  EXPECT_EQ(0U, Data[PageSize]);
-  EXPECT_EQ(0U, Data[PageSize * 2]);
-
-  EXPECT_NE(-1, munlock(Data, MemMap.getCapacity()));
+  if (mlock(Data, MemMap.getCapacity()) != -1) {
+    EXPECT_EQ(1U, Data[0]);
+    EXPECT_EQ(1U, Data[PageSize]);
+    EXPECT_EQ(1U, Data[PageSize * 2]);
+    MemMap.releaseAndZeroPagesToOS(MemMap.getBase(), MemMap.getCapacity());
+    EXPECT_EQ(0U, Data[0]);
+    EXPECT_EQ(0U, Data[PageSize]);
+    EXPECT_EQ(0U, Data[PageSize * 2]);
+
+    EXPECT_NE(-1, munlock(Data, MemMap.getCapacity()));
+  }
 #endif
 
   MemMap.unmap();

From 3bec613d88be5e9ca52230157c713f0cb80b820d Mon Sep 17 00:00:00 2001
From: David Peixotto
Date: Mon, 17 Nov 2025 15:27:23 -0800
Subject: [PATCH 102/105] [lldb] Add helper to create mock objects for dwarf
 expression tests (#167956)

This commit adds a new helper function that creates various mock
objects that can be used in DWARF expression testing. The optional
register value and memory contents are used to create
MockProcessWithMemRead and MockRegisterContext, which return the
expected memory contents and register values.

This simplifies some tests by removing redundant code that creates
these objects in individual tests and consolidates the logic into one
place.
---
 .../Expression/DWARFExpressionTest.cpp        | 221 +++++++-----------
 1 file changed, 89 insertions(+), 132 deletions(-)

diff --git a/lldb/unittests/Expression/DWARFExpressionTest.cpp b/lldb/unittests/Expression/DWARFExpressionTest.cpp
index 8c5568d9e4e65..13110ef7cbb0a 100644
--- a/lldb/unittests/Expression/DWARFExpressionTest.cpp
+++ b/lldb/unittests/Expression/DWARFExpressionTest.cpp
@@ -67,6 +67,33 @@ struct MockProcess : Process {
   }
 };
 
+/// A Process whose `ReadMemory` override queries a DenseMap.
+struct MockProcessWithMemRead : Process {
+  using addr_t = lldb::addr_t;
+
+  llvm::DenseMap<addr_t, addr_t> memory_map;
+
+  MockProcessWithMemRead(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp,
+                         llvm::DenseMap<addr_t, addr_t> &&memory_map)
+      : Process(target_sp, listener_sp), memory_map(memory_map) {}
+  size_t DoReadMemory(addr_t vm_addr, void *buf, size_t size,
+                      Status &error) override {
+    assert(memory_map.contains(vm_addr));
+    assert(size == sizeof(addr_t));
+    *reinterpret_cast<addr_t *>(buf) = memory_map[vm_addr];
+    return sizeof(addr_t);
+  }
+  size_t ReadMemory(addr_t addr, void *buf, size_t size,
+                    Status &status) override {
+    return DoReadMemory(addr, buf, size, status);
+  }
+  bool CanDebug(lldb::TargetSP, bool) override { return true; }
+  Status DoDestroy() override { return Status(); }
+  llvm::StringRef GetPluginName() override { return ""; }
+  void RefreshStateAfterStop() override {}
+  bool DoUpdateThreadList(ThreadList &, ThreadList &) override { return false; }
+};
+
 class MockThread : public Thread {
 public:
   MockThread(Process &process) : Thread(process, /*tid=*/1), m_reg_ctx_sp() {}
@@ -175,24 +202,55 @@ class DWARFExpressionMockProcessTest : public ::testing::Test {
   }
 };
 
-struct PlatformTargetDebugger {
+struct TestContext {
   lldb::PlatformSP platform_sp;
   lldb::TargetSP target_sp;
   lldb::DebuggerSP debugger_sp;
+  lldb::ProcessSP process_sp;
+  lldb::ThreadSP thread_sp;
+  lldb::RegisterContextSP reg_ctx_sp;
 };
 
-/// A helper function to create objects with the
-/// "aarch64-pc-linux" ArchSpec.
-static PlatformTargetDebugger CreateTarget() {
-  ArchSpec arch("aarch64-pc-linux");
+/// A helper function to create TestContext objects with the
+/// given triple, memory, and register contents.
+static bool CreateTestContext(
+    TestContext *ctx, llvm::StringRef triple,
+    std::optional<RegisterValue> reg_value = {},
+    std::optional<llvm::DenseMap<lldb::addr_t, lldb::addr_t>> memory = {}) {
+  ArchSpec arch(triple);
   Platform::SetHostPlatform(
       platform_linux::PlatformLinux::CreateInstance(true, &arch));
   lldb::PlatformSP platform_sp;
   lldb::TargetSP target_sp;
   lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  debugger_sp->GetTargetList().CreateTarget(
+  Status status = debugger_sp->GetTargetList().CreateTarget(
       *debugger_sp, "", arch, eLoadDependentsNo, platform_sp, target_sp);
-  return PlatformTargetDebugger{platform_sp, target_sp, debugger_sp};
+
+  EXPECT_TRUE(status.Success());
+  if (!status.Success())
+    return false;
+
+  lldb::ProcessSP process_sp;
+  if (memory)
+    process_sp = std::make_shared<MockProcessWithMemRead>(
+        target_sp, Listener::MakeListener("dummy"), std::move(*memory));
+  else
+    process_sp = std::make_shared<MockProcess>(target_sp,
+                                               Listener::MakeListener("dummy"));
+
+  auto thread_sp = std::make_shared<MockThread>(*process_sp);
+
+  process_sp->GetThreadList().AddThread(thread_sp);
+
+  lldb::RegisterContextSP reg_ctx_sp;
+  if (reg_value) {
+    reg_ctx_sp = std::make_shared<MockRegisterContext>(*thread_sp, *reg_value);
+    thread_sp->SetRegisterContext(reg_ctx_sp);
+  }
+
+  *ctx = TestContext{platform_sp, target_sp, debugger_sp,
+                     process_sp, thread_sp, reg_ctx_sp};
+  return true;
 }
 
 // NB: This class doesn't use the override keyword to avoid
@@ -486,24 +544,10 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_deref) {
   EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit0, DW_OP_deref}), llvm::Failed());
 
   // Set up a mock process.
-  ArchSpec arch("i386-pc-linux");
-  Platform::SetHostPlatform(
-      platform_linux::PlatformLinux::CreateInstance(true, &arch));
-  lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  ASSERT_TRUE(debugger_sp);
-  lldb::TargetSP target_sp;
-  lldb::PlatformSP platform_sp;
-  debugger_sp->GetTargetList().CreateTarget(
-      *debugger_sp, "", arch, eLoadDependentsNo, platform_sp, target_sp);
-  ASSERT_TRUE(target_sp);
-  ASSERT_TRUE(target_sp->GetArchitecture().IsValid());
-  ASSERT_TRUE(platform_sp);
-  lldb::ListenerSP listener_sp(Listener::MakeListener("dummy"));
-  lldb::ProcessSP process_sp =
-      std::make_shared<MockProcess>(target_sp, listener_sp);
-  ASSERT_TRUE(process_sp);
+  TestContext test_ctx;
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "i386-pc-linux"));
 
-  ExecutionContext exe_ctx(process_sp);
+  ExecutionContext exe_ctx(test_ctx.process_sp);
   // Implicit location: *0x4.
   EXPECT_THAT_EXPECTED(
       Evaluate({DW_OP_lit4, DW_OP_deref, DW_OP_stack_value}, {}, {}, &exe_ctx),
@@ -518,20 +562,10 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_deref) {
 
 TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr) {
   // Set up a wasm target
-  ArchSpec arch("wasm32-unknown-unknown-wasm");
-  lldb::PlatformSP host_platform_sp =
-      platform_linux::PlatformLinux::CreateInstance(true, &arch);
-  ASSERT_TRUE(host_platform_sp);
-  Platform::SetHostPlatform(host_platform_sp);
-  lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  ASSERT_TRUE(debugger_sp);
-  lldb::TargetSP target_sp;
-  lldb::PlatformSP platform_sp;
-  debugger_sp->GetTargetList().CreateTarget(*debugger_sp, "", arch,
-                                            lldb_private::eLoadDependentsNo,
-                                            platform_sp, target_sp);
+  TestContext test_ctx;
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "wasm32-unknown-unknown-wasm"));
 
-  ExecutionContext exe_ctx(target_sp, false);
+  ExecutionContext exe_ctx(test_ctx.target_sp, false);
   // DW_OP_addr takes a single operand of address size width:
   EXPECT_THAT_EXPECTED(
       Evaluate({DW_OP_addr, 0x40, 0x0, 0x0, 0x0}, {}, {}, &exe_ctx),
@@ -587,20 +621,9 @@ TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr_index) {
   dwarf_cu->ExtractDIEsIfNeeded();
 
   // Set up a wasm target
-  ArchSpec arch("wasm32-unknown-unknown-wasm");
-  lldb::PlatformSP host_platform_sp =
-      platform_linux::PlatformLinux::CreateInstance(true, &arch);
-  ASSERT_TRUE(host_platform_sp);
-  Platform::SetHostPlatform(host_platform_sp);
-  lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  ASSERT_TRUE(debugger_sp);
-  lldb::TargetSP target_sp;
-  lldb::PlatformSP platform_sp;
-  debugger_sp->GetTargetList().CreateTarget(*debugger_sp, "", arch,
-                                            lldb_private::eLoadDependentsNo,
-                                            platform_sp, target_sp);
-
-  ExecutionContext exe_ctx(target_sp, false);
+  TestContext test_ctx;
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "wasm32-unknown-unknown-wasm"));
+  ExecutionContext exe_ctx(test_ctx.target_sp, false);
 
   auto evaluate = [&](DWARFExpression &expr) -> llvm::Expected<Value> {
     DataExtractor extractor;
@@ -823,28 +846,10 @@ TEST(DWARFExpression, Extensions) {
       subsystems;
 
   // Set up a wasm target.
-  ArchSpec arch("wasm32-unknown-unknown-wasm");
-  lldb::PlatformSP host_platform_sp =
-      platform_linux::PlatformLinux::CreateInstance(true, &arch);
-  ASSERT_TRUE(host_platform_sp);
-  Platform::SetHostPlatform(host_platform_sp);
-  lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  ASSERT_TRUE(debugger_sp);
-  lldb::TargetSP target_sp;
-  lldb::PlatformSP platform_sp;
-  debugger_sp->GetTargetList().CreateTarget(*debugger_sp, "", arch,
-                                            lldb_private::eLoadDependentsNo,
-                                            platform_sp, target_sp);
-  // Set up a mock process and thread.
-  lldb::ListenerSP listener_sp(Listener::MakeListener("dummy"));
-  lldb::ProcessSP process_sp =
-      std::make_shared<MockProcess>(target_sp, listener_sp);
-  ASSERT_TRUE(process_sp);
-  MockThread thread(*process_sp);
+  TestContext test_ctx;
   const uint32_t kExpectedValue = 42;
-  lldb::RegisterContextSP reg_ctx_sp = std::make_shared<MockRegisterContext>(
-      thread, RegisterValue(kExpectedValue));
-  thread.SetRegisterContext(reg_ctx_sp);
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "wasm32-unknown-unknown-wasm",
+                                RegisterValue(kExpectedValue)));
 
   llvm::Expected<TestFile> file = TestFile::fromYaml(yamldata);
   EXPECT_THAT_EXPECTED(file, llvm::Succeeded());
@@ -853,7 +858,8 @@
   SymbolFileWasm sym_file_wasm(obj_file_sp, nullptr);
   auto *dwarf_unit = sym_file_wasm.DebugInfo().GetUnitAtIndex(0);
 
-  testExpressionVendorExtensions(module_sp, *dwarf_unit, reg_ctx_sp.get());
+  testExpressionVendorExtensions(module_sp, *dwarf_unit,
+                                 test_ctx.reg_ctx_sp.get());
 }
 
 TEST(DWARFExpression, ExtensionsSplitSymbols) {
@@ -1022,28 +1028,10 @@ TEST(DWARFExpression, ExtensionsSplitSymbols) {
       subsystems;
 
   // Set up a wasm target.
-  ArchSpec arch("wasm32-unknown-unknown-wasm");
-  lldb::PlatformSP host_platform_sp =
-      platform_linux::PlatformLinux::CreateInstance(true, &arch);
-  ASSERT_TRUE(host_platform_sp);
-  Platform::SetHostPlatform(host_platform_sp);
-  lldb::DebuggerSP debugger_sp = Debugger::CreateInstance();
-  ASSERT_TRUE(debugger_sp);
-  lldb::TargetSP target_sp;
-  lldb::PlatformSP platform_sp;
-  debugger_sp->GetTargetList().CreateTarget(*debugger_sp, "", arch,
-                                            lldb_private::eLoadDependentsNo,
-                                            platform_sp, target_sp);
-  // Set up a mock process and thread.
-  lldb::ListenerSP listener_sp(Listener::MakeListener("dummy"));
-  lldb::ProcessSP process_sp =
-      std::make_shared<MockProcess>(target_sp, listener_sp);
-  ASSERT_TRUE(process_sp);
-  MockThread thread(*process_sp);
+  TestContext test_ctx;
   const uint32_t kExpectedValue = 42;
-  lldb::RegisterContextSP reg_ctx_sp = std::make_shared<MockRegisterContext>(
-      thread, RegisterValue(kExpectedValue));
-  thread.SetRegisterContext(reg_ctx_sp);
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "wasm32-unknown-unknown-wasm",
+                                RegisterValue(kExpectedValue)));
 
   llvm::Expected<TestFile> skeleton_file =
       TestFile::fromYaml(skeleton_yamldata);
@@ -1059,7 +1047,8 @@
   SymbolFileWasm sym_file_wasm(obj_file_sp, nullptr);
   auto *dwarf_unit = sym_file_wasm.DebugInfo().GetUnitAtIndex(0);
 
-  testExpressionVendorExtensions(sym_module_sp, *dwarf_unit, reg_ctx_sp.get());
+  testExpressionVendorExtensions(sym_module_sp, *dwarf_unit,
+                                 test_ctx.reg_ctx_sp.get());
 }
 
 TEST_F(DWARFExpressionMockProcessTest, DW_OP_piece_file_addr) {
@@ -1092,33 +1081,6 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_piece_file_addr) {
       ExpectHostAddress({0x11, 0x22}));
 }
 
-/// A Process whose `ReadMemory` override queries a DenseMap.
-struct MockProcessWithMemRead : Process {
-  using addr_t = lldb::addr_t;
-
-  llvm::DenseMap<addr_t, addr_t> memory_map;
-
-  MockProcessWithMemRead(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp,
-                         llvm::DenseMap<addr_t, addr_t> &&memory_map)
-      : Process(target_sp, listener_sp), memory_map(memory_map) {}
-  size_t DoReadMemory(addr_t vm_addr, void *buf, size_t size,
-                      Status &error) override {
-    assert(memory_map.contains(vm_addr));
-    assert(size == sizeof(addr_t));
-    *reinterpret_cast<addr_t *>(buf) = memory_map[vm_addr];
-    return sizeof(addr_t);
-  }
-  size_t ReadMemory(addr_t addr, void *buf, size_t size,
-                    Status &status) override {
-    return DoReadMemory(addr, buf, size, status);
-  }
-  bool CanDebug(lldb::TargetSP, bool) override { return true; }
-  Status DoDestroy() override { return Status(); }
-  llvm::StringRef GetPluginName() override { return ""; }
-  void RefreshStateAfterStop() override {}
-  bool DoUpdateThreadList(ThreadList &, ThreadList &) override { return false; }
-};
-
 class DWARFExpressionMockProcessTestWithAArch
     : public DWARFExpressionMockProcessTest {
 public:
@@ -1149,18 +1111,13 @@ TEST_F(DWARFExpressionMockProcessTestWithAArch, DW_op_deref_no_ptr_fixing) {
   constexpr lldb::addr_t addr = 42;
   memory[addr] = expected_value;
 
-  PlatformTargetDebugger test_setup = CreateTarget();
-  lldb::ProcessSP process_sp = std::make_shared<MockProcessWithMemRead>(
-      test_setup.target_sp, Listener::MakeListener("dummy"), std::move(memory));
-  auto thread = std::make_shared<MockThread>(*process_sp);
-  lldb::RegisterContextSP reg_ctx_sp =
-      std::make_shared<MockRegisterContext>(*thread, RegisterValue(addr));
-  thread->SetRegisterContext(reg_ctx_sp);
-  process_sp->GetThreadList().AddThread(thread);
+  TestContext test_ctx;
+  ASSERT_TRUE(CreateTestContext(&test_ctx, "aarch64-pc-linux",
+                                RegisterValue(addr), std::move(memory)));
 
   auto evaluate_expr = [&](auto &expr_data) {
-    ExecutionContext exe_ctx(process_sp);
-    return Evaluate(expr_data, {}, {}, &exe_ctx, reg_ctx_sp.get());
+    ExecutionContext exe_ctx(test_ctx.process_sp);
+    return Evaluate(expr_data, {}, {}, &exe_ctx, test_ctx.reg_ctx_sp.get());
   };
 
   uint8_t expr_reg[] = {DW_OP_breg22, 0};

From b48f29356641102a52ac8aa05f007bfce719df24 Mon Sep 17 00:00:00 2001
From: Jeremy Furtek
Date: Mon, 17 Nov 2025 17:30:39 -0600
Subject: [PATCH 103/105] Fix build breakage (MLIR LLVM dialect requires
 MLIRInferIntRangeInterface) (#168440)

This MR fixes a recent build breakage introduced by
https://github.com/llvm/llvm-project/pull/166648 (post-merge build error
here: https://lab.llvm.org/buildbot/#/builders/138/builds/21929).

The `MLIRInferIntRangeInterface` library is now a public dependency of
`MLIRLLVMDialect`.
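As a rough illustration of the resulting usage, once the interface library
is a public dependency of the dialect, consumers pick it up transitively by
linking only `MLIRLLVMDialect`. The target and source file below are
hypothetical, not part of this change:

```cmake
# Hypothetical consumer target; MLIRInferIntRangeInterface is no longer
# (and should not be) listed directly, since MLIRLLVMDialect now carries it.
add_mlir_unittest(MyLLVMDialectTests
  MyTest.cpp
)
mlir_target_link_libraries(MyLLVMDialectTests
  PRIVATE
  MLIRLLVMDialect
)
```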
---
 mlir/lib/Dialect/LLVMIR/CMakeLists.txt       | 1 +
 mlir/unittests/Dialect/LLVMIR/CMakeLists.txt | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
index cc66face1c002..a73f0c1278ec0 100644
--- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
@@ -31,6 +31,7 @@ add_mlir_dialect_library(MLIRLLVMDialect
   MLIRControlFlowInterfaces
   MLIRDataLayoutInterfaces
   MLIRFunctionInterfaces
+  MLIRInferIntRangeInterface
   MLIRInferTypeOpInterface
   MLIRIR
   MLIRMemorySlotInterfaces
diff --git a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
index 568126fd342cc..7cc130d02ad74 100644
--- a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
@@ -4,5 +4,4 @@ add_mlir_unittest(MLIRLLVMIRTests
 mlir_target_link_libraries(MLIRLLVMIRTests
   PRIVATE
   MLIRLLVMDialect
-  MLIRInferIntRangeInterface
 )

From da61dd28c6dd77901058580e391cb8c88bb506f2 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov
Date: Mon, 17 Nov 2025 15:43:42 -0800
Subject: [PATCH 104/105] [libc] Move mbtowc, mbstowcs and inverse functions to
 stdlib.h (#168455)

These functions should be declared in `stdlib.h`, not `wchar.h`, as
confusing as it is. Move them to the proper header file and matching
directories in src/ and test/ trees.

This was discovered while testing a libc++ build against llvm-libc, which
re-declares functions like mbtowc in the std namespace in the `<cstdlib>`
header, and then uses those functions in its locale implementation.
---
 libc/config/linux/x86_64/entrypoints.txt     |  8 +--
 libc/include/stdlib.yaml                     | 32 ++++++++++
 libc/include/wchar.yaml                      | 31 ----------
 libc/src/stdlib/CMakeLists.txt               | 59 +++++++++++++++++++
 libc/src/{wchar => stdlib}/mbstowcs.cpp      |  2 +-
 libc/src/{wchar => stdlib}/mbstowcs.h        |  6 +-
 libc/src/{wchar => stdlib}/mbtowc.cpp        |  2 +-
 libc/src/{wchar => stdlib}/mbtowc.h          |  6 +-
 libc/src/{wchar => stdlib}/wcstombs.cpp      |  2 +-
 libc/src/{wchar => stdlib}/wcstombs.h        |  6 +-
 libc/src/{wchar => stdlib}/wctomb.cpp        |  2 +-
 libc/src/{wchar => stdlib}/wctomb.h          |  6 +-
 libc/src/wchar/CMakeLists.txt                | 59 -------------------
 libc/test/src/stdlib/CMakeLists.txt          | 50 ++++++++++++++++
 .../src/{wchar => stdlib}/mbstowcs_test.cpp  |  2 +-
 .../src/{wchar => stdlib}/mbtowc_test.cpp    |  2 +-
 .../src/{wchar => stdlib}/wcstombs_test.cpp  |  2 +-
 .../src/{wchar => stdlib}/wctomb_test.cpp    |  2 +-
 libc/test/src/wchar/CMakeLists.txt           | 50 ----------------
 19 files changed, 165 insertions(+), 164 deletions(-)
 rename libc/src/{wchar => stdlib}/mbstowcs.cpp (97%)
 rename libc/src/{wchar => stdlib}/mbstowcs.h (83%)
 rename libc/src/{wchar => stdlib}/mbtowc.cpp (97%)
 rename libc/src/{wchar => stdlib}/mbtowc.h (84%)
 rename libc/src/{wchar => stdlib}/wcstombs.cpp (97%)
 rename libc/src/{wchar => stdlib}/wcstombs.h (83%)
 rename libc/src/{wchar => stdlib}/wctomb.cpp (97%)
 rename libc/src/{wchar => stdlib}/wctomb.h (83%)
 rename libc/test/src/{wchar => stdlib}/mbstowcs_test.cpp (99%)
 rename libc/test/src/{wchar => stdlib}/mbtowc_test.cpp (99%)
 rename libc/test/src/{wchar => stdlib}/wcstombs_test.cpp (98%)
 rename libc/test/src/{wchar => stdlib}/wctomb_test.cpp (98%)

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 5036c9438a503..910bdc53cbbc5 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1254,7 +1254,11 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdlib.atexit
     libc.src.stdlib.exit
libc.src.stdlib.getenv + libc.src.stdlib.mbstowcs + libc.src.stdlib.mbtowc libc.src.stdlib.quick_exit + libc.src.stdlib.wcstombs + libc.src.stdlib.wctomb # signal.h entrypoints libc.src.signal.kill @@ -1372,13 +1376,9 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.wchar.mbrlen libc.src.wchar.mbsinit libc.src.wchar.mbrtowc - libc.src.wchar.mbtowc - libc.src.wchar.mbstowcs libc.src.wchar.mbsrtowcs libc.src.wchar.mbsnrtowcs libc.src.wchar.wcrtomb - libc.src.wchar.wctomb - libc.src.wchar.wcstombs libc.src.wchar.wcsrtombs libc.src.wchar.wcsnrtombs diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index 495eb7e1317b6..4752244279243 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -17,6 +17,7 @@ types: - type_name: lldiv_t - type_name: locale_t - type_name: size_t + - type_name: wchar_t enums: [] objects: [] functions: @@ -135,6 +136,22 @@ functions: arguments: - type: long long - type: long long + - name: mbstowcs + standards: + - stdc + return_type: size_t + arguments: + - type: wchar_t *__restrict + - type: const char *__restrict + - type: size_t + - name: mbtowc + standards: + - stdc + return_type: int + arguments: + - type: wchar_t *__restrict + - type: const char *__restrict + - type: size_t - name: memalignment standards: - stdc @@ -332,3 +349,18 @@ functions: return_type: int arguments: - type: const char * + - name: wctomb + standards: + - stdc + return_type: int + arguments: + - type: char * + - type: wchar_t + - name: wcstombs + standards: + - stdc + return_type: size_t + arguments: + - type: char *__restrict + - type: const wchar_t *__restrict + - type: size_t diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index a524c7f56bed0..7a94f9b542b7f 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -50,14 +50,6 @@ functions: - type: const char *__restrict - type: size_t - type: mbstate_t *__restrict - - name: mbtowc - standards: - - stdc - return_type: int - arguments: - - type: wchar_t *__restrict - - type: const char *__restrict - - type: size_t - name: mbsnrtowcs standards: - stdc @@ -77,14 +69,6 @@ functions: - type: const char **__restrict - type: size_t - type: mbstate_t *__restrict - - name: mbstowcs - standards: - - stdc - return_type: size_t - arguments: - - type: wchar_t *__restrict - - type: const char *__restrict - - type: size_t - name: mbsinit standards: - stdc @@ -269,13 +253,6 @@ functions: - type: char *__restrict - type: wchar_t - type: mbstate_t *__restrict - - name: wctomb - standards: - - stdc - return_type: int - arguments: - - type: char * - - type: wchar_t - name: wcscpy standards: - stdc @@ -336,14 +313,6 @@ functions: - type: const wchar_t *__restrict - type: wchar_t **__restrict - type: int - - name: wcstombs - standards: - - stdc - return_type: size_t - arguments: - - type: char *__restrict - - type: const wchar_t *__restrict - - type: size_t - name: wcstoul standards: - stdc diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 1ccdcc8bec148..62da469f0eb9e 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -368,6 +368,65 @@ add_entrypoint_object( libc.hdr.types.size_t ) +add_entrypoint_object( + mbtowc + SRCS + mbtowc.cpp + HDRS + mbtowc.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.wchar_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.libc_errno + libc.src.__support.wchar.mbrtowc + libc.src.__support.wchar.mbstate +) + +add_entrypoint_object( + mbstowcs + SRCS + mbstowcs.cpp + HDRS + mbstowcs.h + 
DEPENDS + libc.hdr.types.size_t + libc.hdr.types.wchar_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.macros.null_check + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate + libc.src.__support.wchar.mbsnrtowcs +) + +add_entrypoint_object( + wctomb + SRCS + wctomb.cpp + HDRS + wctomb.h + DEPENDS + libc.hdr.types.wchar_t + libc.src.__support.wchar.wcrtomb + libc.src.__support.wchar.mbstate + libc.src.__support.libc_errno +) + +add_entrypoint_object( + wcstombs + SRCS + wcstombs.cpp + HDRS + wcstombs.h + DEPENDS + libc.hdr.types.wchar_t + libc.src.__support.wchar.mbstate + libc.src.__support.wchar.wcsnrtombs + libc.src.__support.libc_errno +) + if(NOT LIBC_TARGET_OS_IS_BAREMETAL AND NOT LIBC_TARGET_OS_IS_GPU) if(LLVM_LIBC_INCLUDE_SCUDO) set(SCUDO_DEPS "") diff --git a/libc/src/wchar/mbstowcs.cpp b/libc/src/stdlib/mbstowcs.cpp similarity index 97% rename from libc/src/wchar/mbstowcs.cpp rename to libc/src/stdlib/mbstowcs.cpp index 43e953cdf2ac2..6d283ea46e3b9 100644 --- a/libc/src/wchar/mbstowcs.cpp +++ b/libc/src/stdlib/mbstowcs.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/wchar/mbstowcs.h" +#include "src/stdlib/mbstowcs.h" #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" diff --git a/libc/src/wchar/mbstowcs.h b/libc/src/stdlib/mbstowcs.h similarity index 83% rename from libc/src/wchar/mbstowcs.h rename to libc/src/stdlib/mbstowcs.h index 7d08a838b2324..90f8195a39ec5 100644 --- a/libc/src/wchar/mbstowcs.h +++ b/libc/src/stdlib/mbstowcs.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H -#define LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H +#ifndef LLVM_LIBC_SRC_STDLIB_MBSTOWCS_H +#define LLVM_LIBC_SRC_STDLIB_MBSTOWCS_H #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" @@ -19,4 +19,4 @@ size_t mbstowcs(wchar_t *__restrict pwcs, const char *__restrict s, size_t n); } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_WCHAR_MBSTOWCS_H +#endif // LLVM_LIBC_SRC_STDLIB_MBSTOWCS_H diff --git a/libc/src/wchar/mbtowc.cpp b/libc/src/stdlib/mbtowc.cpp similarity index 97% rename from libc/src/wchar/mbtowc.cpp rename to libc/src/stdlib/mbtowc.cpp index 6d099d43da5fa..5f482463f4711 100644 --- a/libc/src/wchar/mbtowc.cpp +++ b/libc/src/stdlib/mbtowc.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/wchar/mbtowc.h" +#include "src/stdlib/mbtowc.h" #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" diff --git a/libc/src/wchar/mbtowc.h b/libc/src/stdlib/mbtowc.h similarity index 84% rename from libc/src/wchar/mbtowc.h rename to libc/src/stdlib/mbtowc.h index f974197f81b58..acd85cb77ba76 100644 --- a/libc/src/wchar/mbtowc.h +++ b/libc/src/stdlib/mbtowc.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_WCHAR_MBTOWC_H -#define LLVM_LIBC_SRC_WCHAR_MBTOWC_H +#ifndef LLVM_LIBC_SRC_STDLIB_MBTOWC_H +#define LLVM_LIBC_SRC_STDLIB_MBTOWC_H #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" @@ -19,4 +19,4 @@ int mbtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n); } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_WCHAR_MBTOWC_H +#endif // LLVM_LIBC_SRC_STDLIB_MBTOWC_H diff --git a/libc/src/wchar/wcstombs.cpp b/libc/src/stdlib/wcstombs.cpp similarity index 97% rename from libc/src/wchar/wcstombs.cpp rename 
to libc/src/stdlib/wcstombs.cpp index c3793cbe912cd..712af958456de 100644 --- a/libc/src/wchar/wcstombs.cpp +++ b/libc/src/stdlib/wcstombs.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/wchar/wcstombs.h" +#include "src/stdlib/wcstombs.h" #include "hdr/types/char32_t.h" #include "hdr/types/size_t.h" diff --git a/libc/src/wchar/wcstombs.h b/libc/src/stdlib/wcstombs.h similarity index 83% rename from libc/src/wchar/wcstombs.h rename to libc/src/stdlib/wcstombs.h index cd0008a168d90..39515431098cb 100644 --- a/libc/src/wchar/wcstombs.h +++ b/libc/src/stdlib/wcstombs.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOMBS_H -#define LLVM_LIBC_SRC_WCHAR_WCSTOMBS_H +#ifndef LLVM_LIBC_SRC_STDLIB_WCSTOMBS_H +#define LLVM_LIBC_SRC_STDLIB_WCSTOMBS_H #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" @@ -19,4 +19,4 @@ size_t wcstombs(char *__restrict s, const wchar_t *__restrict pwcs, size_t n); } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_WCHAR_WCSTOMBS_H +#endif // LLVM_LIBC_SRC_STDLIB_WCSTOMBS_H diff --git a/libc/src/wchar/wctomb.cpp b/libc/src/stdlib/wctomb.cpp similarity index 97% rename from libc/src/wchar/wctomb.cpp rename to libc/src/stdlib/wctomb.cpp index 142302e6ae09b..0ca1a84cd923e 100644 --- a/libc/src/wchar/wctomb.cpp +++ b/libc/src/stdlib/wctomb.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/wchar/wctomb.h" +#include "src/stdlib/wctomb.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" diff --git a/libc/src/wchar/wctomb.h b/libc/src/stdlib/wctomb.h similarity index 83% rename from libc/src/wchar/wctomb.h rename to libc/src/stdlib/wctomb.h index 02a34e5ad229f..90afa31d9e707 100644 --- a/libc/src/wchar/wctomb.h +++ b/libc/src/stdlib/wctomb.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_WCHAR_WCTOMB_H -#define LLVM_LIBC_SRC_WCHAR_WCTOMB_H +#ifndef LLVM_LIBC_SRC_STDLIB_WCTOMB_H +#define LLVM_LIBC_SRC_STDLIB_WCTOMB_H #include "hdr/types/mbstate_t.h" #include "hdr/types/wchar_t.h" @@ -19,4 +19,4 @@ int wctomb(char *s, wchar_t wc); } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_WCHAR_WCTOMB_H +#endif // LLVM_LIBC_SRC_STDLIB_WCTOMB_H diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index e6d9af9eacf73..9ca7295118a11 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -156,19 +156,6 @@ add_entrypoint_object( libc.src.__support.wchar.mbstate ) -add_entrypoint_object( - wctomb - SRCS - wctomb.cpp - HDRS - wctomb.h - DEPENDS - libc.hdr.types.wchar_t - libc.src.__support.wchar.wcrtomb - libc.src.__support.wchar.mbstate - libc.src.__support.libc_errno -) - add_entrypoint_object( mbsinit SRCS @@ -201,39 +188,6 @@ add_entrypoint_object( libc.src.__support.wchar.mbstate ) -add_entrypoint_object( - mbtowc - SRCS - mbtowc.cpp - HDRS - mbtowc.h - DEPENDS - libc.hdr.types.size_t - libc.hdr.types.wchar_t - libc.src.__support.common - libc.src.__support.macros.config - libc.src.__support.libc_errno - libc.src.__support.wchar.mbrtowc - libc.src.__support.wchar.mbstate -) - -add_entrypoint_object( - mbstowcs - SRCS - mbstowcs.cpp - HDRS - mbstowcs.h - DEPENDS - libc.hdr.types.size_t - libc.hdr.types.wchar_t - libc.src.__support.common - libc.src.__support.macros.config - 
libc.src.__support.macros.null_check - libc.src.__support.libc_errno - libc.src.__support.wchar.mbstate - libc.src.__support.wchar.mbsnrtowcs -) - add_entrypoint_object( mbsrtowcs SRCS @@ -266,19 +220,6 @@ add_entrypoint_object( libc.src.__support.wchar.mbsnrtowcs ) -add_entrypoint_object( - wcstombs - SRCS - wcstombs.cpp - HDRS - wcstombs.h - DEPENDS - libc.hdr.types.wchar_t - libc.src.__support.wchar.mbstate - libc.src.__support.wchar.wcsnrtombs - libc.src.__support.libc_errno -) - add_entrypoint_object( wcsrtombs SRCS diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 42e8faa3fd69f..bcd3d139aa46c 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -388,6 +388,56 @@ add_libc_test( libc.src.stdlib.memalignment ) +add_libc_test( + mbtowc_test + SUITE + libc-stdlib-tests + SRCS + mbtowc_test.cpp + DEPENDS + libc.hdr.errno_macros + libc.src.stdlib.mbtowc + libc.hdr.types.wchar_t + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + mbstowcs_test + SUITE + libc-stdlib-tests + SRCS + mbstowcs_test.cpp + DEPENDS + libc.hdr.errno_macros + libc.src.stdlib.mbstowcs + libc.hdr.types.wchar_t + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + wctomb_test + SUITE + libc-stdlib-tests + SRCS + wctomb_test.cpp + DEPENDS + libc.hdr.errno_macros + libc.src.stdlib.wctomb + libc.hdr.types.wchar_t +) + +add_libc_test( + wcstombs_test + SUITE + libc-stdlib-tests + SRCS + wcstombs_test.cpp + DEPENDS + libc.src.stdlib.wcstombs + libc.test.UnitTest.ErrnoCheckingTest + libc.hdr.types.wchar_t +) + if(LLVM_LIBC_FULL_BUILD) add_libc_test( diff --git a/libc/test/src/wchar/mbstowcs_test.cpp b/libc/test/src/stdlib/mbstowcs_test.cpp similarity index 99% rename from libc/test/src/wchar/mbstowcs_test.cpp rename to libc/test/src/stdlib/mbstowcs_test.cpp index 742f47819c84b..125683a3eca97 100644 --- a/libc/test/src/wchar/mbstowcs_test.cpp +++ b/libc/test/src/stdlib/mbstowcs_test.cpp @@ -9,7 +9,7 @@ #include "hdr/errno_macros.h" #include "hdr/types/wchar_t.h" #include "src/__support/macros/null_check.h" -#include "src/wchar/mbstowcs.h" +#include "src/stdlib/mbstowcs.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/wchar/mbtowc_test.cpp b/libc/test/src/stdlib/mbtowc_test.cpp similarity index 99% rename from libc/test/src/wchar/mbtowc_test.cpp rename to libc/test/src/stdlib/mbtowc_test.cpp index 7c86d5583aaed..7946e077b647e 100644 --- a/libc/test/src/wchar/mbtowc_test.cpp +++ b/libc/test/src/stdlib/mbtowc_test.cpp @@ -8,7 +8,7 @@ #include "hdr/errno_macros.h" #include "hdr/types/wchar_t.h" -#include "src/wchar/mbtowc.h" +#include "src/stdlib/mbtowc.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/wchar/wcstombs_test.cpp b/libc/test/src/stdlib/wcstombs_test.cpp similarity index 98% rename from libc/test/src/wchar/wcstombs_test.cpp rename to libc/test/src/stdlib/wcstombs_test.cpp index 61e0873dc9711..792a4edb51b38 100644 --- a/libc/test/src/wchar/wcstombs_test.cpp +++ b/libc/test/src/stdlib/wcstombs_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/wchar/wcstombs.h" +#include "src/stdlib/wcstombs.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/wchar/wctomb_test.cpp b/libc/test/src/stdlib/wctomb_test.cpp similarity index 98% rename from libc/test/src/wchar/wctomb_test.cpp rename to 
libc/test/src/stdlib/wctomb_test.cpp index 357f36267b689..56bebf87a2796 100644 --- a/libc/test/src/wchar/wctomb_test.cpp +++ b/libc/test/src/stdlib/wctomb_test.cpp @@ -8,7 +8,7 @@ #include "hdr/errno_macros.h" #include "hdr/types/wchar_t.h" -#include "src/wchar/wctomb.h" +#include "src/stdlib/wctomb.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index a62a30fe00124..7a7cfaee7f367 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -62,32 +62,6 @@ add_libc_test( libc.test.UnitTest.ErrnoCheckingTest ) -add_libc_test( - mbtowc_test - SUITE - libc_wchar_unittests - SRCS - mbtowc_test.cpp - DEPENDS - libc.hdr.errno_macros - libc.src.wchar.mbtowc - libc.hdr.types.wchar_t - libc.test.UnitTest.ErrnoCheckingTest -) - -add_libc_test( - mbstowcs_test - SUITE - libc_wchar_unittests - SRCS - mbstowcs_test.cpp - DEPENDS - libc.hdr.errno_macros - libc.src.wchar.mbstowcs - libc.hdr.types.wchar_t - libc.test.UnitTest.ErrnoCheckingTest -) - add_libc_test( mblen_test SUITE @@ -188,30 +162,6 @@ add_libc_test( libc.test.UnitTest.ErrnoCheckingTest ) -add_libc_test( - wctomb_test - SUITE - libc_wchar_unittests - SRCS - wctomb_test.cpp - DEPENDS - libc.hdr.errno_macros - libc.src.wchar.wctomb - libc.hdr.types.wchar_t -) - -add_libc_test( - wcstombs_test - SUITE - libc_wchar_unittests - SRCS - wcstombs_test.cpp - DEPENDS - libc.src.wchar.wcstombs - libc.test.UnitTest.ErrnoCheckingTest - libc.hdr.types.wchar_t -) - add_libc_test( wcsrtombs_test SUITE From 865c92be033477d07832af13342f8129614c942d Mon Sep 17 00:00:00 2001 From: Fabrice de Gans Date: Mon, 17 Nov 2025 18:45:53 -0500 Subject: [PATCH 105/105] llvm: Export `ilist_node_base` template specialization (#168094) The core LLVM library implements a specialization for `ilist_node_base`, which is used by other components. This is needed to link properly when building LLVM as a library on Windows. This effort is tracked in #109483. --- llvm/include/llvm/ADT/ilist_node_base.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/include/llvm/ADT/ilist_node_base.h b/llvm/include/llvm/ADT/ilist_node_base.h index 49b197d3466d9..937e7d060e489 100644 --- a/llvm/include/llvm/ADT/ilist_node_base.h +++ b/llvm/include/llvm/ADT/ilist_node_base.h @@ -67,6 +67,9 @@ class ilist_node_base : public ilist_detail::node_base_prevnext< EnableSentinelTracking>, public ilist_detail::node_base_parent {}; +// Specialization implemented in the core LLVM library. +template class LLVM_ABI ilist_node_base; + } // end namespace llvm #endif // LLVM_ADT_ILIST_NODE_BASE_H
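
The fix above relies on the standard Windows DLL approach of exporting one
library-provided copy of a template so that all components link against the
same definition. As a rough, self-contained sketch of that general technique
(using the closely related `extern template` form; `API_ABI`, `Widget`, and
`BUILDING_CORE` are hypothetical stand-ins, not LLVM code):

```cpp
// Minimal sketch of the export pattern, under the assumption that API_ABI
// plays the role of LLVM_ABI and Widget the role of ilist_node_base.
// BUILDING_CORE would be defined only while compiling the core library.
#if defined(_WIN32) && defined(BUILDING_CORE)
#define API_ABI __declspec(dllexport)
#elif defined(_WIN32)
#define API_ABI __declspec(dllimport)
#else
#define API_ABI
#endif

// widget.h (shared header)
template <typename T> struct Widget {
  T value;
  T get() const { return value; }
};
// Suppress implicit instantiation in consumers; the core library provides
// and exports the single definition of Widget<int>.
extern template struct API_ABI Widget<int>;

// widget.cpp (core library only)
template struct API_ABI Widget<int>;
```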