diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index a6d6edbd547e7..ac39a47d43c07 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -370,6 +370,13 @@ def github_get_metrics( started_at = job.started_at completed_at = job.completed_at + if completed_at is None: + logging.info( + f"Workflow {task.id} is marked completed but has a job without a " + "completion timestamp." + ) + continue + # GitHub API can return results where the started_at is slightly # later then the created_at (or completed earlier than started). # This would cause a -23h59mn delta, which will show up as +24h diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index d923d2a90e908..ad448766e665f 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -615,7 +615,7 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; -def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>; +def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX81>; def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index e3be262622844..c0ed799970122 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -28,6 +28,9 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx81 -DPTX=81 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM80 %s // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s @@ -1025,6 +1028,10 @@ __device__ void nvvm_cvt_sm80() { // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00) __nvvm_f2tf32_rna(1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rna_satfinite(1.0f); + #endif #endif // CHECK: ret void } @@ -1058,9 +1065,6 @@ __device__ void nvvm_cvt_sm89() { __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); - - // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00) - __nvvm_f2tf32_rna_satfinite(1.0f); #endif // CHECK: ret void } diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 0f0f87915bafe..3f7dd8e402b78 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -113,9 +113,6 @@ def push_dynamic_library_lookup_path(config, new_path): 
config.environment[dynamic_library_lookup_var] = new_ld_library_path_64 -# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites. -# See https://github.com/llvm/llvm-project/issues/106636 for more details. -# # Choose between lit's internal shell pipeline runner and a real shell. If # LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override. use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") @@ -123,8 +120,9 @@ def push_dynamic_library_lookup_path(config, new_path): # 0 is external, "" is default, and everything else is internal. execute_external = use_lit_shell == "0" else: - # Otherwise we default to internal everywhere. - execute_external = False + # Otherwise we default to internal on Windows and external elsewhere, as + # bash on Windows is usually very slow. + execute_external = not sys.platform in ["win32"] # Allow expanding substitutions that are based on other substitutions config.recursiveExpansionLimit = 10 diff --git a/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S index 78454e33f7356..d5524020f2636 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S +++ b/compiler-rt/test/orc/TestCases/Darwin/arm64/objc-imageinfo.S @@ -5,7 +5,9 @@ // RUN: rm -rf %t // RUN: split-file %s %t -// RUN: (cd %t; %clang -c *.S) +// RUN: pushd %t +// RUN: %clang -c *.S +// RUN: popd // Check individual versions are loadable. diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 42571862b24b2..acfd4c8a14acb 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -945,6 +945,7 @@ if(LLVM_LIBC_FULL_BUILD) # arpa/inet.h entrypoints libc.src.arpa.inet.htonl libc.src.arpa.inet.htons + libc.src.arpa.inet.inet_addr libc.src.arpa.inet.inet_aton libc.src.arpa.inet.ntohl libc.src.arpa.inet.ntohs diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index b62a46b7178d5..83d5c42816632 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -1078,6 +1078,7 @@ if(LLVM_LIBC_FULL_BUILD) # arpa/inet.h entrypoints libc.src.arpa.inet.htonl libc.src.arpa.inet.htons + libc.src.arpa.inet.inet_addr libc.src.arpa.inet.inet_aton libc.src.arpa.inet.ntohl libc.src.arpa.inet.ntohs diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 910bdc53cbbc5..bbff4969cb413 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1119,6 +1119,7 @@ if(LLVM_LIBC_FULL_BUILD) # arpa/inet.h entrypoints libc.src.arpa.inet.htonl libc.src.arpa.inet.htons + libc.src.arpa.inet.inet_addr libc.src.arpa.inet.inet_aton libc.src.arpa.inet.ntohl libc.src.arpa.inet.ntohs diff --git a/libc/include/arpa/inet.yaml b/libc/include/arpa/inet.yaml index 6e0629072b6ef..350a4d74e5bec 100644 --- a/libc/include/arpa/inet.yaml +++ b/libc/include/arpa/inet.yaml @@ -3,6 +3,7 @@ header_template: inet.h.def macros: [] types: - type_name: in_addr + - type_name: in_addr_t enums: [] objects: [] functions: @@ -18,6 +19,12 @@ functions: return_type: uint16_t arguments: - type: uint16_t + - name: inet_addr + standards: + - POSIX + return_type: in_addr_t + arguments: + - type: const char * - name: inet_aton standards: - llvm_libc_ext diff --git a/libc/include/llvm-libc-macros/netinet-in-macros.h 
b/libc/include/llvm-libc-macros/netinet-in-macros.h index 2011c34e288cd..7a4d26d832114 100644 --- a/libc/include/llvm-libc-macros/netinet-in-macros.h +++ b/libc/include/llvm-libc-macros/netinet-in-macros.h @@ -29,6 +29,7 @@ #define INADDR_ANY __LLVM_LIBC_CAST(static_cast, in_addr_t, 0x00000000) #define INADDR_BROADCAST __LLVM_LIBC_CAST(static_cast, in_addr_t, 0xffffffff) +#define INADDR_NONE __LLVM_LIBC_CAST(static_cast, in_addr_t, 0xffffffff) #define INET_ADDRSTRLEN 16 #define INET6_ADDRSTRLEN 46 diff --git a/libc/src/arpa/inet/CMakeLists.txt b/libc/src/arpa/inet/CMakeLists.txt index bb43e24ec9d0b..3b3d0f43b8586 100644 --- a/libc/src/arpa/inet/CMakeLists.txt +++ b/libc/src/arpa/inet/CMakeLists.txt @@ -35,6 +35,21 @@ add_entrypoint_object( libc.src.__support.str_to_integer ) +add_entrypoint_object( + inet_addr + SRCS + inet_addr.cpp + HDRS + inet_addr.h + DEPENDS + libc.include.arpa_inet + libc.include.llvm-libc-macros.netinet_in_macros + libc.include.llvm-libc-types.in_addr + libc.include.llvm-libc-types.in_addr_t + libc.src.__support.common + libc.src.arpa.inet.inet_aton +) + add_entrypoint_object( ntohl SRCS diff --git a/libc/src/arpa/inet/inet_addr.cpp b/libc/src/arpa/inet/inet_addr.cpp new file mode 100644 index 0000000000000..8ce88c0df8aec --- /dev/null +++ b/libc/src/arpa/inet/inet_addr.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of inet_addr function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/arpa/inet/inet_addr.h" +#include "include/llvm-libc-macros/netinet-in-macros.h" +#include "include/llvm-libc-types/in_addr.h" +#include "include/llvm-libc-types/in_addr_t.h" +#include "src/__support/common.h" +#include "src/arpa/inet/inet_aton.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(in_addr_t, inet_addr, (const char *cp)) { + in_addr addr; + return inet_aton(cp, &addr) ? addr.s_addr : INADDR_NONE; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/arpa/inet/inet_addr.h b/libc/src/arpa/inet/inet_addr.h new file mode 100644 index 0000000000000..66f1ae80dd5a0 --- /dev/null +++ b/libc/src/arpa/inet/inet_addr.h @@ -0,0 +1,21 @@ +//===-- Implementation header of inet_addr ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_ARPA_INET_INET_ADDR_H +#define LLVM_LIBC_SRC_ARPA_INET_INET_ADDR_H + +#include "include/llvm-libc-types/in_addr_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +in_addr_t inet_addr(const char *cp); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_ARPA_INET_INET_ADDR_H diff --git a/libc/test/src/arpa/inet/CMakeLists.txt b/libc/test/src/arpa/inet/CMakeLists.txt index 690f751bef5e1..1d400c4374f70 100644 --- a/libc/test/src/arpa/inet/CMakeLists.txt +++ b/libc/test/src/arpa/inet/CMakeLists.txt @@ -22,6 +22,17 @@ add_libc_unittest( libc.src.arpa.inet.ntohs ) +add_libc_unittest( + inet_addr + SUITE + libc_arpa_inet_unittests + SRCS + inet_addr_test.cpp + DEPENDS + libc.src.arpa.inet.htonl + libc.src.arpa.inet.inet_addr +) + add_libc_unittest( inet_aton SUITE diff --git a/libc/test/src/arpa/inet/inet_addr_test.cpp b/libc/test/src/arpa/inet/inet_addr_test.cpp new file mode 100644 index 0000000000000..3f0ea0fc3b3f3 --- /dev/null +++ b/libc/test/src/arpa/inet/inet_addr_test.cpp @@ -0,0 +1,25 @@ +//===-- Unittests for inet_addr -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/arpa/inet/htonl.h" +#include "src/arpa/inet/inet_addr.h" +#include "test/UnitTest/Test.h" + +namespace LIBC_NAMESPACE_DECL { + +TEST(LlvmLibcInetAddr, ValidTest) { + ASSERT_EQ(htonl(0x7f010204), inet_addr("127.1.2.4")); + ASSERT_EQ(htonl(0x7f010004), inet_addr("127.1.4")); +} + +TEST(LlvmLibcInetAddr, InvalidTest) { + ASSERT_EQ(htonl(0xffffffff), inet_addr("")); + ASSERT_EQ(htonl(0xffffffff), inet_addr("x")); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libcxx/include/span b/libcxx/include/span index 3d4f9e4ba7831..1911badd88cb1 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -310,30 +310,32 @@ public: } template - _LIBCPP_HIDE_FROM_ABI constexpr span first() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span first() const noexcept { static_assert(_Count <= _Extent, "span::first(): Count out of range"); return span{data(), _Count}; } template - _LIBCPP_HIDE_FROM_ABI constexpr span last() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span last() const noexcept { static_assert(_Count <= _Extent, "span::last(): Count out of range"); return span{data() + size() - _Count, _Count}; } - _LIBCPP_HIDE_FROM_ABI constexpr span first(size_type __count) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span + first(size_type __count) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count <= size(), "span::first(count): count out of range"); return {data(), __count}; } - _LIBCPP_HIDE_FROM_ABI constexpr span last(size_type __count) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span + last(size_type __count) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count <= size(), "span::last(count): count out of range"); return {data() + size() - __count, __count}; } template - _LIBCPP_HIDE_FROM_ABI constexpr auto - subspan() const noexcept -> span { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto subspan() const 
noexcept + -> span { static_assert(_Offset <= _Extent, "span::subspan(): Offset out of range"); static_assert(_Count == dynamic_extent || _Count <= _Extent - _Offset, "span::subspan(): Offset + Count out of range"); @@ -342,7 +344,7 @@ public: return _ReturnType{data() + _Offset, _Count == dynamic_extent ? size() - _Offset : _Count}; } - _LIBCPP_HIDE_FROM_ABI constexpr span + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span subspan(size_type __offset, size_type __count = dynamic_extent) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__offset <= size(), "span::subspan(offset, count): offset out of range"); if (__count == dynamic_extent) @@ -352,52 +354,58 @@ public: return {data() + __offset, __count}; } - _LIBCPP_HIDE_FROM_ABI constexpr size_type size() const noexcept { return _Extent; } - _LIBCPP_HIDE_FROM_ABI constexpr size_type size_bytes() const noexcept { return _Extent * sizeof(element_type); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr size_type size() const noexcept { return _Extent; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr size_type size_bytes() const noexcept { + return _Extent * sizeof(element_type); + } [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const noexcept { return _Extent == 0; } - _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](size_type __idx) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](size_type __idx) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx < size(), "span::operator[](index): index out of range"); return __data_[__idx]; } # if _LIBCPP_STD_VER >= 26 - _LIBCPP_HIDE_FROM_ABI constexpr reference at(size_type __index) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference at(size_type __index) const { if (__index >= size()) std::__throw_out_of_range("span"); return __data_[__index]; } # endif - _LIBCPP_HIDE_FROM_ABI constexpr reference front() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference front() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "span::front() on empty span"); return __data_[0]; } - _LIBCPP_HIDE_FROM_ABI constexpr reference back() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference back() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "span::back() on empty span"); return __data_[size() - 1]; } - _LIBCPP_HIDE_FROM_ABI constexpr pointer data() const noexcept { return __data_; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr pointer data() const noexcept { return __data_; } // [span.iter], span iterator support - _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() const noexcept { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data(), data(), data() + size()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI constexpr iterator end() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator end() const noexcept { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data() + size(), data(), data() + size()); # else return iterator(data() + size()); # endif } - _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rend() const noexcept { return reverse_iterator(begin()); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rbegin() const noexcept { + return reverse_iterator(end()); + } + [[nodiscard]] 
_LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rend() const noexcept { + return reverse_iterator(begin()); + } _LIBCPP_HIDE_FROM_ABI span __as_bytes() const noexcept { return span{reinterpret_cast(data()), size_bytes()}; @@ -478,36 +486,38 @@ public: : __data_{__other.data()}, __size_{__other.size()} {} template - _LIBCPP_HIDE_FROM_ABI constexpr span first() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span first() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(_Count <= size(), "span::first(): Count out of range"); return span{data(), _Count}; } template - _LIBCPP_HIDE_FROM_ABI constexpr span last() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span last() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(_Count <= size(), "span::last(): Count out of range"); return span{data() + size() - _Count, _Count}; } - _LIBCPP_HIDE_FROM_ABI constexpr span first(size_type __count) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span + first(size_type __count) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count <= size(), "span::first(count): count out of range"); return {data(), __count}; } - _LIBCPP_HIDE_FROM_ABI constexpr span last(size_type __count) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span + last(size_type __count) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count <= size(), "span::last(count): count out of range"); return {data() + size() - __count, __count}; } template - _LIBCPP_HIDE_FROM_ABI constexpr span subspan() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr span subspan() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(_Offset <= size(), "span::subspan(): Offset out of range"); _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(_Count == dynamic_extent || _Count <= size() - _Offset, "span::subspan(): Offset + Count out of range"); return span{data() + _Offset, _Count == dynamic_extent ? 
size() - _Offset : _Count}; } - constexpr span _LIBCPP_HIDE_FROM_ABI + [[nodiscard]] constexpr span _LIBCPP_HIDE_FROM_ABI subspan(size_type __offset, size_type __count = dynamic_extent) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__offset <= size(), "span::subspan(offset, count): offset out of range"); if (__count == dynamic_extent) @@ -517,52 +527,58 @@ public: return {data() + __offset, __count}; } - _LIBCPP_HIDE_FROM_ABI constexpr size_type size() const noexcept { return __size_; } - _LIBCPP_HIDE_FROM_ABI constexpr size_type size_bytes() const noexcept { return __size_ * sizeof(element_type); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr size_type size() const noexcept { return __size_; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr size_type size_bytes() const noexcept { + return __size_ * sizeof(element_type); + } [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const noexcept { return __size_ == 0; } - _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](size_type __idx) const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference operator[](size_type __idx) const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx < size(), "span::operator[](index): index out of range"); return __data_[__idx]; } # if _LIBCPP_STD_VER >= 26 - _LIBCPP_HIDE_FROM_ABI constexpr reference at(size_type __index) const { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference at(size_type __index) const { if (__index >= size()) std::__throw_out_of_range("span"); return __data_[__index]; } # endif - _LIBCPP_HIDE_FROM_ABI constexpr reference front() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference front() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "span::front() on empty span"); return __data_[0]; } - _LIBCPP_HIDE_FROM_ABI constexpr reference back() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reference back() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "span::back() on empty span"); return __data_[size() - 1]; } - _LIBCPP_HIDE_FROM_ABI constexpr pointer data() const noexcept { return __data_; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr pointer data() const noexcept { return __data_; } // [span.iter], span iterator support - _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() const noexcept { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data(), data(), data() + size()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI constexpr iterator end() const noexcept { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr iterator end() const noexcept { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data() + size(), data(), data() + size()); # else return iterator(data() + size()); # endif } - _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rbegin() const noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rend() const noexcept { return reverse_iterator(begin()); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rbegin() const noexcept { + return reverse_iterator(end()); + } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr reverse_iterator rend() const noexcept { + return reverse_iterator(begin()); + } _LIBCPP_HIDE_FROM_ABI span __as_bytes() const noexcept { return {reinterpret_cast(data()), size_bytes()}; @@ -585,13 +601,13 @@ inline constexpr bool ranges::enable_view> = true; // as_bytes & 
as_writable_bytes template -_LIBCPP_HIDE_FROM_ABI auto as_bytes(span<_Tp, _Extent> __s) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto as_bytes(span<_Tp, _Extent> __s) noexcept { return __s.__as_bytes(); } template requires(!is_const_v<_Tp>) -_LIBCPP_HIDE_FROM_ABI auto as_writable_bytes(span<_Tp, _Extent> __s) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto as_writable_bytes(span<_Tp, _Extent> __s) noexcept { return __s.__as_writable_bytes(); } diff --git a/libcxx/test/libcxx/containers/views/views.span/nodiscard.verify.cpp b/libcxx/test/libcxx/containers/views/views.span/nodiscard.verify.cpp new file mode 100644 index 0000000000000..666680597f147 --- /dev/null +++ b/libcxx/test/libcxx/containers/views/views.span/nodiscard.verify.cpp @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++20 + +// + +// Check that functions are marked [[nodiscard]] + +#include + +#include "test_macros.h" + +void test() { + { // Test with a static extent + std::span sp; + + sp.first<0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.last<0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.first(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.last(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.subspan<0, 0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.subspan(0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.size_bytes(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#if TEST_STD_VER >= 26 + sp.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + sp.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + std::as_bytes(sp); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} 
+ std::as_writable_bytes(sp); + } + { // Test with a dynamic extent + std::span sp; + + sp.first<0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.last<0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.first(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.last(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.subspan<0, 0>(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.subspan(0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.size_bytes(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#if TEST_STD_VER >= 26 + sp.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + sp.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sp.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + std::as_bytes(sp); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::as_writable_bytes(sp); + } +} diff --git a/lldb/bindings/lua/lua-typemaps.swig b/lldb/bindings/lua/lua-typemaps.swig index f2a7401419368..a4a77b1a214c9 100644 --- a/lldb/bindings/lua/lua-typemaps.swig +++ b/lldb/bindings/lua/lua-typemaps.swig @@ -122,9 +122,9 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); } // Disable default type checking for this method to avoid SWIG dispatch issues. -// +// // Problem: SBThread::GetStopDescription has two overloads: -// 1. GetStopDescription(char* dst_or_null, size_t dst_len) +// 1. GetStopDescription(char* dst_or_null, size_t dst_len) // 2. GetStopDescription(lldb::SBStream& stream) // // SWIG generates a dispatch function to select the correct overload based on argument types. @@ -132,9 +132,9 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); // However, this dispatcher doesn't consider typemaps that transform function signatures. // // In lua, our typemap converts GetStopDescription(char*, size_t) to GetStopDescription(int). -// The dispatcher still checks against the original (char*, size_t) signature instead of +// The dispatcher still checks against the original (char*, size_t) signature instead of // the transformed (int) signature, causing type matching to fail. 
-// This only affects SBThread::GetStopDescription since the type check also matches +// This only affects SBThread::GetStopDescription since the type check also matches // the argument name, which is unique to this function. %typemap(typecheck, precedence=SWIG_TYPECHECK_POINTER) (char *dst_or_null, size_t dst_len) "" @@ -251,7 +251,8 @@ LLDB_NUMBER_TYPEMAP(enum SWIGTYPE); %typemap(in) lldb::FileSP { luaL_Stream *p = (luaL_Stream *)luaL_checkudata(L, $input, LUA_FILEHANDLE); lldb::FileSP file_sp; - file_sp = std::make_shared(p->f, false); + file_sp = std::make_shared( + p->f, lldb_private::NativeFile::eOpenOptionReadWrite, false); if (!file_sp->IsValid()) return luaL_error(L, "Invalid file"); $1 = file_sp; diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index bd2884481ee95..c28d6ce792184 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5896,7 +5896,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX950 roundup(lds-size / (320 * 4)) GFX125* - roundup(lds-size / (256 * 4)) + roundup(lds-size / (512 * 4)) 24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution _INVALID_OPERATION with specified exceptions diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 438a84d39ed66..7e95545425f2d 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -1200,3 +1200,14 @@ Windows When compiling with clang-cl, CMake may use ``llvm-mt`` as the Manifest Tool when available. ```llvm-mt``` is only present when libxml2 is found at build-time. To ensure using Microsoft's Manifest Tool set `CMAKE_MT=mt`. + +Apple/OSX +--------- + +**CMAKE_OSX_SYSROOT**:STRING + When compiling for OSX, in order for the test suite to find libSystem to link + dylib tests you'll need to run CMake with ```xcrun --show-sdk-path``` as the + string to pass in so that the testsuite can find your os libraries. + + This will show up as ```ld: library not found for -lSystem``` when running + tests. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h b/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h deleted file mode 100644 index 077d88d1758a0..0000000000000 --- a/llvm/include/llvm/ExecutionEngine/Orc/GetDylibInterface.h +++ /dev/null @@ -1,41 +0,0 @@ -//===---- GetDylibInterface.h - Get interface for real dylib ----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Get symbol interface from a real dynamic library or TAPI file. These -// interfaces can be used to simulate weak linking (ld64 -weak-lx / -// -weak_library) against a library that is absent at runtime. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H -#define LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H - -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/Support/Compiler.h" - -namespace llvm::orc { - -/// Returns a SymbolNameSet containing the exported symbols defined in the -/// given dylib. -LLVM_ABI Expected -getDylibInterfaceFromDylib(ExecutionSession &ES, Twine Path); - -/// Returns a SymbolNameSet containing the exported symbols defined in the -/// relevant slice of the TapiUniversal file. 
-LLVM_ABI Expected -getDylibInterfaceFromTapiFile(ExecutionSession &ES, Twine Path); - -/// Returns a SymbolNameSet containing the exported symbols defined in the -/// relevant slice of the given file, which may be either a dylib or a tapi -/// file. -LLVM_ABI Expected getDylibInterface(ExecutionSession &ES, - Twine Path); - -} // namespace llvm::orc - -#endif // LLVM_EXECUTIONENGINE_ORC_GETDYLIBINTERFACE_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h index a0342d8d75bc2..0e789b5e05a75 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_MACHO_H #define LLVM_EXECUTIONENGINE_ORC_MACHO_H +#include "llvm/ExecutionEngine/Orc/CoreContainers.h" #include "llvm/ExecutionEngine/Orc/LoadLinkableFile.h" #include "llvm/Object/Archive.h" #include "llvm/Support/Compiler.h" @@ -31,6 +32,7 @@ class MachOUniversalBinary; namespace orc { +class ExecutionSession; class JITDylib; class ObjectLayer; @@ -93,6 +95,37 @@ class ForceLoadMachOArchiveMembers { bool ObjCOnly; }; +using GetFallbackArchsFn = + unique_function>( + uint32_t CPUType, uint32_t CPUSubType)>; + +/// Match the exact CPU type/subtype only. +LLVM_ABI SmallVector> +noFallbackArchs(uint32_t CPUType, uint32_t CPUSubType); + +/// Match standard dynamic loader fallback rules. +LLVM_ABI SmallVector> +standardMachOFallbackArchs(uint32_t CPUType, uint32_t CPUSubType); + +/// Returns a SymbolNameSet containing the exported symbols defined in the +/// given dylib. +LLVM_ABI Expected getDylibInterfaceFromDylib( + ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs = standardMachOFallbackArchs); + +/// Returns a SymbolNameSet containing the exported symbols defined in the +/// relevant slice of the TapiUniversal file. +LLVM_ABI Expected getDylibInterfaceFromTapiFile( + ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs = standardMachOFallbackArchs); + +/// Returns a SymbolNameSet containing the exported symbols defined in the +/// relevant slice of the given file, which may be either a dylib or a tapi +/// file. +LLVM_ABI Expected getDylibInterface( + ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs = standardMachOFallbackArchs); + } // namespace orc } // namespace llvm diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index f34392538a7cb..db16a3005f6c1 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -26,7 +26,6 @@ add_llvm_component_library(LLVMOrcJIT ExecutionUtils.cpp ExecutorResolutionGenerator.cpp ObjectFileInterface.cpp - GetDylibInterface.cpp IndirectionUtils.cpp InProcessMemoryAccess.cpp IRCompileLayer.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp b/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp deleted file mode 100644 index 9ccb211931a5b..0000000000000 --- a/llvm/lib/ExecutionEngine/Orc/GetDylibInterface.cpp +++ /dev/null @@ -1,128 +0,0 @@ -//===-------- GetDylibInterface.cpp - Get interface for real dylib --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ExecutionEngine/Orc/GetDylibInterface.h" - -#include "llvm/BinaryFormat/Magic.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/MachOUniversal.h" -#include "llvm/Object/TapiUniversal.h" - -#define DEBUG_TYPE "orc" - -namespace llvm::orc { - -Expected getDylibInterfaceFromDylib(ExecutionSession &ES, - Twine Path) { - auto CPUType = MachO::getCPUType(ES.getTargetTriple()); - if (!CPUType) - return CPUType.takeError(); - - auto CPUSubType = MachO::getCPUSubType(ES.getTargetTriple()); - if (!CPUSubType) - return CPUSubType.takeError(); - - auto Buf = MemoryBuffer::getFile(Path); - if (!Buf) - return createFileError(Path, Buf.getError()); - - auto BinFile = object::createBinary((*Buf)->getMemBufferRef()); - if (!BinFile) - return BinFile.takeError(); - - std::unique_ptr MachOFile; - if (isa(**BinFile)) - MachOFile.reset(dyn_cast(BinFile->release())); - else if (auto *MachOUni = - dyn_cast(BinFile->get())) { - for (auto &O : MachOUni->objects()) { - if (O.getCPUType() == *CPUType && - (O.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) == *CPUSubType) { - if (auto Obj = O.getAsObjectFile()) - MachOFile = std::move(*Obj); - else - return Obj.takeError(); - break; - } - } - if (!MachOFile) - return make_error("MachO universal binary at " + Path + - " does not contain a slice for " + - ES.getTargetTriple().str(), - inconvertibleErrorCode()); - } else - return make_error("File at " + Path + " is not a MachO", - inconvertibleErrorCode()); - - if (MachOFile->getHeader().filetype != MachO::MH_DYLIB) - return make_error("MachO at " + Path + " is not a dylib", - inconvertibleErrorCode()); - - SymbolNameSet Symbols; - for (auto &Sym : MachOFile->symbols()) { - if (auto Name = Sym.getName()) - Symbols.insert(ES.intern(*Name)); - else - return Name.takeError(); - } - - return std::move(Symbols); -} - -Expected getDylibInterfaceFromTapiFile(ExecutionSession &ES, - Twine Path) { - SymbolNameSet Symbols; - - auto TapiFileBuffer = MemoryBuffer::getFile(Path); - if (!TapiFileBuffer) - return createFileError(Path, TapiFileBuffer.getError()); - - auto Tapi = - object::TapiUniversal::create((*TapiFileBuffer)->getMemBufferRef()); - if (!Tapi) - return Tapi.takeError(); - - auto CPUType = MachO::getCPUType(ES.getTargetTriple()); - if (!CPUType) - return CPUType.takeError(); - - auto CPUSubType = MachO::getCPUSubType(ES.getTargetTriple()); - if (!CPUSubType) - return CPUSubType.takeError(); - - auto &IF = (*Tapi)->getInterfaceFile(); - auto Interface = - IF.extract(MachO::getArchitectureFromCpuType(*CPUType, *CPUSubType)); - if (!Interface) - return Interface.takeError(); - - for (auto *Sym : (*Interface)->exports()) - Symbols.insert(ES.intern(Sym->getName())); - - return Symbols; -} - -Expected getDylibInterface(ExecutionSession &ES, Twine Path) { - file_magic Magic; - if (auto EC = identify_magic(Path, Magic)) - return createFileError(Path, EC); - - switch (Magic) { - case file_magic::macho_universal_binary: - case file_magic::macho_dynamically_linked_shared_lib: - return getDylibInterfaceFromDylib(ES, Path); - case file_magic::tapi_file: - return getDylibInterfaceFromTapiFile(ES, Path); - default: - return make_error("Cannot get interface for " + Path + - " unrecognized file type", - inconvertibleErrorCode()); - } -} - -} // namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp index 
89721d16930c0..731d24d1272d4 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp @@ -10,9 +10,11 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/BinaryFormat/Magic.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/TapiUniversal.h" #include "llvm/Support/FileSystem.h" #define DEBUG_TYPE "orc" @@ -280,5 +282,177 @@ Expected ForceLoadMachOArchiveMembers::operator()( return true; } +LLVM_ABI SmallVector> +noFallbackArchs(uint32_t CPUType, uint32_t CPUSubType) { + SmallVector> Result; + Result.push_back({CPUType, CPUSubType}); + return Result; +} + +SmallVector> +standardMachOFallbackArchs(uint32_t CPUType, uint32_t CPUSubType) { + SmallVector> Archs; + + // Match given CPU type/subtype first. + Archs.push_back({CPUType, CPUSubType}); + + switch (CPUType) { + case MachO::CPU_TYPE_ARM64: + // Handle arm64 variants. + switch (CPUSubType) { + case MachO::CPU_SUBTYPE_ARM64_ALL: + Archs.push_back({CPUType, MachO::CPU_SUBTYPE_ARM64E}); + break; + default: + break; + } + break; + default: + break; + } + + return Archs; +} + +Expected +getDylibInterfaceFromDylib(ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs) { + auto InitCPUType = MachO::getCPUType(ES.getTargetTriple()); + if (!InitCPUType) + return InitCPUType.takeError(); + + auto InitCPUSubType = MachO::getCPUSubType(ES.getTargetTriple()); + if (!InitCPUSubType) + return InitCPUSubType.takeError(); + + auto Buf = MemoryBuffer::getFile(Path); + if (!Buf) + return createFileError(Path, Buf.getError()); + + auto BinFile = object::createBinary((*Buf)->getMemBufferRef()); + if (!BinFile) + return BinFile.takeError(); + + std::unique_ptr MachOFile; + if (isa(**BinFile)) { + MachOFile.reset(dyn_cast(BinFile->release())); + + // TODO: Check that dylib arch is compatible. + } else if (auto *MachOUni = + dyn_cast(BinFile->get())) { + SmallVector> ArchsToTry; + if (GetFallbackArchs) + ArchsToTry = GetFallbackArchs(*InitCPUType, *InitCPUSubType); + else + ArchsToTry.push_back({*InitCPUType, *InitCPUSubType}); + + for (auto &[CPUType, CPUSubType] : ArchsToTry) { + for (auto &O : MachOUni->objects()) { + if (O.getCPUType() == CPUType && + (O.getCPUSubType() & ~MachO::CPU_SUBTYPE_MASK) == CPUSubType) { + if (auto Obj = O.getAsObjectFile()) + MachOFile = std::move(*Obj); + else + return Obj.takeError(); + break; + } + } + if (MachOFile) // If found, break out. 
+ break; + } + if (!MachOFile) + return make_error( + "MachO universal binary at " + Path + + " does not contain a compatible slice for " + + ES.getTargetTriple().str(), + inconvertibleErrorCode()); + } else + return make_error("File at " + Path + " is not a MachO", + inconvertibleErrorCode()); + + if (MachOFile->getHeader().filetype != MachO::MH_DYLIB) + return make_error("MachO at " + Path + " is not a dylib", + inconvertibleErrorCode()); + + SymbolNameSet Symbols; + for (auto &Sym : MachOFile->symbols()) { + if (auto Name = Sym.getName()) + Symbols.insert(ES.intern(*Name)); + else + return Name.takeError(); + } + + return std::move(Symbols); +} + +Expected +getDylibInterfaceFromTapiFile(ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs) { + SymbolNameSet Symbols; + + auto TapiFileBuffer = MemoryBuffer::getFile(Path); + if (!TapiFileBuffer) + return createFileError(Path, TapiFileBuffer.getError()); + + auto Tapi = + object::TapiUniversal::create((*TapiFileBuffer)->getMemBufferRef()); + if (!Tapi) + return Tapi.takeError(); + + auto InitCPUType = MachO::getCPUType(ES.getTargetTriple()); + if (!InitCPUType) + return InitCPUType.takeError(); + + auto InitCPUSubType = MachO::getCPUSubType(ES.getTargetTriple()); + if (!InitCPUSubType) + return InitCPUSubType.takeError(); + + SmallVector> ArchsToTry; + if (GetFallbackArchs) + ArchsToTry = GetFallbackArchs(*InitCPUType, *InitCPUSubType); + else + ArchsToTry.push_back({*InitCPUType, *InitCPUSubType}); + + auto &IF = (*Tapi)->getInterfaceFile(); + + auto ArchSet = IF.getArchitectures(); + for (auto [CPUType, CPUSubType] : ArchsToTry) { + auto A = MachO::getArchitectureFromCpuType(CPUType, CPUSubType); + if (ArchSet.has(A)) { + if (auto Interface = IF.extract(A)) { + for (auto *Sym : (*Interface)->exports()) + Symbols.insert(ES.intern(Sym->getName())); + return Symbols; + } else + return Interface.takeError(); + } + } + + return make_error( + "MachO interface file at " + Path + + " does not contain a compatible slice for " + + ES.getTargetTriple().str(), + inconvertibleErrorCode()); +} + +Expected getDylibInterface(ExecutionSession &ES, Twine Path, + GetFallbackArchsFn GetFallbackArchs) { + file_magic Magic; + if (auto EC = identify_magic(Path, Magic)) + return createFileError(Path, EC); + + switch (Magic) { + case file_magic::macho_universal_binary: + case file_magic::macho_dynamically_linked_shared_lib: + return getDylibInterfaceFromDylib(ES, Path, std::move(GetFallbackArchs)); + case file_magic::tapi_file: + return getDylibInterfaceFromTapiFile(ES, Path, std::move(GetFallbackArchs)); + default: + return make_error("Cannot get interface for " + Path + + " unrecognized file type", + inconvertibleErrorCode()); + } +} + } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 97298f9d74171..d0c4b1b9f83fd 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -662,12 +662,15 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); + Value *GuardCheckDest = B.CreateExtractValue(GuardCheck, 0); + Value *GuardFinalDest = B.CreateExtractValue(GuardCheck, 1); // Ensure that the first argument is passed in the correct register. 
GuardCheck->setCallingConv(CallingConv::CFGuard_Check); SmallVector Args(llvm::make_pointer_range(GuestExit->args())); - CallInst *Call = B.CreateCall(Arm64Ty, GuardCheck, Args); + OperandBundleDef OB("cfguardtarget", GuardFinalDest); + CallInst *Call = B.CreateCall(Arm64Ty, GuardCheckDest, Args, OB); Call->setTailCallKind(llvm::CallInst::TCK_MustTail); if (Call->getType()->isVoidTy()) @@ -767,11 +770,21 @@ void AArch64Arm64ECCallLowering::lowerCall(CallBase *CB) { CallInst *GuardCheck = B.CreateCall(GuardFnType, GuardCheckLoad, {CalledOperand, Thunk}, Bundles); + Value *GuardCheckDest = B.CreateExtractValue(GuardCheck, 0); + Value *GuardFinalDest = B.CreateExtractValue(GuardCheck, 1); // Ensure that the first argument is passed in the correct register. GuardCheck->setCallingConv(CallingConv::CFGuard_Check); - CB->setCalledOperand(GuardCheck); + // Update the call: set the callee, and add a bundle with the final + // destination, + CB->setCalledOperand(GuardCheckDest); + OperandBundleDef OB("cfguardtarget", GuardFinalDest); + auto *NewCall = CallBase::addOperandBundle(CB, LLVMContext::OB_cfguardtarget, + OB, CB->getIterator()); + NewCall->copyMetadata(*CB); + CB->replaceAllUsesWith(NewCall); + CB->eraseFromParent(); } bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { @@ -789,7 +802,8 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) { I64Ty = Type::getInt64Ty(M->getContext()); VoidTy = Type::getVoidTy(M->getContext()); - GuardFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy}, false); + GuardFnType = + FunctionType::get(StructType::get(PtrTy, PtrTy), {PtrTy, PtrTy}, false); DispatchFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy, PtrTy}, false); GuardFnCFGlobal = M->getOrInsertGlobal("__os_arm64x_check_icall_cfg", PtrTy); GuardFnGlobal = M->getOrInsertGlobal("__os_arm64x_check_icall", PtrTy); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 34c85d588f9c4..e2a79a49a9e92 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -162,7 +162,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ ]>; let Entry = 1 in -def CC_AArch64_Win64PCS : CallingConv; +def CC_AArch64_Win64PCS : CallingConv> + ], + AArch64_Common) +>; // Vararg functions on windows pass floats in integer registers let Entry = 1 in @@ -177,6 +183,9 @@ def CC_AArch64_Win64_VarArg : CallingConv<[ // a stack layout compatible with the x64 calling convention. let Entry = 1 in def CC_AArch64_Arm64EC_VarArg : CallingConv<[ + // 'CFGuardTarget' is used for Arm64EC; it passes its parameter in X9. + CCIfCFGuardTarget>, + CCIfNest>, // Convert small floating-point values to integer. @@ -345,7 +354,7 @@ def CC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ let Entry = 1 in def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ - CCIfType<[i64], CCAssignToReg<[X11]>> + CCIfType<[i64], CCAssignToReg<[X11, X9]>> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 8bfdbb7c5c310..33cef795b111d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1186,21 +1186,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = Mode.DX10Clamp; - unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { - // LDS is allocated in 256 dword blocks. 
- LDSAlignShift = 10; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize163840)) { - // LDS is allocated in 320 dword blocks. + unsigned LDSAlignShift = 8; + switch (getLdsDwGranularity(STM)) { + case 512: + case 320: LDSAlignShift = 11; - } else if (STM.getFeatureBits().test( - FeatureAddressableLocalMemorySize65536)) { - // LDS is allocated in 128 dword blocks. + break; + case 128: LDSAlignShift = 9; - } else { - // LDS is allocated in 64 dword blocks. + break; + case 64: LDSAlignShift = 8; + break; + default: + llvm_unreachable("invalid LDS block size"); } ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index aff4cfe1dc70e..998a2b5b36a87 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3565,8 +3565,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256 - : 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768)) + return 64; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536)) + return 128; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) + return 320; + if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 512; + return 64; // In sync with getAddressableLocalMemorySize } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f0bdf472b96ed..ff9d9723dddea 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -683,7 +683,7 @@ let hasSideEffects = false in { defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">; defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; - defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; + defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<80>]>; defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>; defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 10588b9739188..75ce1b144a2e7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -222,7 +222,6 @@ class RISCVAsmParser : public MCTargetAsmParser { ParseStatus parseRegReg(OperandVector &Operands); ParseStatus parseXSfmmVType(OperandVector &Operands); - ParseStatus parseRetval(OperandVector &Operands); ParseStatus parseZcmpStackAdj(OperandVector &Operands, bool ExpectNegative = false); ParseStatus parseZcmpNegStackAdj(OperandVector &Operands) { @@ -1655,10 +1654,6 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -1, (1 << 5) - 1, "immediate must be non-zero in the range"); - case Match_InvalidXSfmmVType: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return generateXSfmmVTypeError(ErrorLoc); - } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); diff --git
a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td index d77a44a0d9e01..a3e02ee4fc430 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td @@ -13,7 +13,6 @@ def XSfmmVTypeAsmOperand : AsmOperandClass { let Name = "XSfmmVType"; let ParserMethod = "parseXSfmmVType"; - let DiagnosticType = "InvalidXSfmmVType"; let RenderMethod = "addVTypeIOperands"; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index fba1ccf2c8c9b..f153db177cac1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2638,16 +2638,6 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, if (Shr->isExact()) return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal)); - if (C.isZero()) { - // == 0 is u< 1. - if (Pred == CmpInst::ICMP_EQ) - return new ICmpInst(CmpInst::ICMP_ULT, X, - ConstantInt::get(ShrTy, (C + 1).shl(ShAmtVal))); - else - return new ICmpInst(CmpInst::ICMP_UGT, X, - ConstantInt::get(ShrTy, (C + 1).shl(ShAmtVal) - 1)); - } - if (Shr->hasOneUse()) { // Canonicalize the shift into an 'and': // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt) diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll index 55c3e7779478e..afabf6ce0fdf2 100644 --- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll +++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll @@ -228,7 +228,7 @@ define i64 @known_power_of_two_urem_loop_lshr(i64 %size, i64 %a) { ; CHECK-NEXT: [[UREM:%.*]] = and i64 [[SIZE:%.*]], [[TMP0]] ; CHECK-NEXT: [[ADD]] = add nuw i64 [[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = lshr i64 [[PHI]], 1 -; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp ult i64 [[PHI]], 2 +; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp eq i64 [[I]], 0 ; CHECK-NEXT: br i1 [[ICMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i64 [[SUM]] @@ -328,7 +328,7 @@ define i64 @known_power_of_two_urem_loop_ashr_negative_2(i64 %size, i64 %a) { ; CHECK-NEXT: [[UREM:%.*]] = urem i64 [[SIZE:%.*]], [[PHI]] ; CHECK-NEXT: [[ADD]] = add nsw i64 [[SUM]], [[UREM]] ; CHECK-NEXT: [[I]] = ashr i64 [[PHI]], 2 -; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp ult i64 [[PHI]], 4 +; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp eq i64 [[I]], 0 ; CHECK-NEXT: br i1 [[ICMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i64 [[SUM]] diff --git a/llvm/test/CodeGen/AArch64/arm64ec-indirect-call.ll b/llvm/test/CodeGen/AArch64/arm64ec-indirect-call.ll new file mode 100644 index 0000000000000..e6a42c382e4f6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64ec-indirect-call.ll @@ -0,0 +1,50 @@ +; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s + +define void @simple(ptr %g) { +; CHECK-LABEL: "#simple": +; CHECK: str x30, [sp, #-16]! 
+; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: mov x11, x0 +; CHECK-NEXT: blr x8 +; CHECK-NEXT: blr x11 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret + +entry: + call void %g() + ret void +} + +; Make sure the check for the security cookie doesn't use x9. +define void @stackguard(ptr %g) sspreq { +; CHECK-LABEL: "#stackguard": +; CHECK: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: adrp x8, __security_cookie +; CHECK-NEXT: ldr x10, [sp, #8] +; CHECK-NEXT: ldr x8, [x8, :lo12:__security_cookie] +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: b.ne .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: fmov d0, #1.00000000 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp, #16] +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 + +entry: + %call = tail call double %g(double noundef 1.000000e+00) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll index 4349b18fd394c..b31e87c54b563 100644 --- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll @@ -31,10 +31,10 @@ ; GFX1200-MESA: .long 45100 ; GFX1200-MESA-NEXT: .long 1024 -; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200 +; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x100 ; GFX1250-MESA: .long 45100 -; GFX1250-MESA-NEXT: .long 512 +; GFX1250-MESA-NEXT: .long 256 @lds = internal addrspace(3) global [4096 x i8] poison diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll index 3db0fa8f21759..7e8d5e0f30b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_lds_i32(i32 %val) { ; GCN-LABEL: test_lds_array_i8: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i8() { %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5 %val = load i8, ptr addrspace(3) %gep @@ -52,7 +52,7 @@ define amdgpu_kernel void @test_lds_array_i8() { ; GCN-LABEL: test_lds_array_i16: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i16() { %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10 %val = load i16, ptr addrspace(3) %gep @@ -63,7 +63,7 @@ define amdgpu_kernel void @test_lds_array_i16() { ; GCN-LABEL: test_lds_array_i32: ; GCN: .amdhsa_group_segment_fixed_size 327680 ; GCN: ; LDSByteSize: 327680 bytes/workgroup -; MESA: granulated_lds_size = 320 +; MESA: granulated_lds_size = 160 define amdgpu_kernel void @test_lds_array_i32() { %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20 %val = load i32, ptr addrspace(3) %gep diff 
--git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll index f934c85f68e0f..68694faf833e9 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll @@ -114,7 +114,7 @@ ; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader ; CHECK-NEXT: .forward_progress: true -; CHECK-NEXT: .lds_size: 0x400 +; CHECK-NEXT: .lds_size: 0x800 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll new file mode 100644 index 0000000000000..b3575c68b892f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll @@ -0,0 +1,215 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx950 <%s | FileCheck %s --check-prefixes=CHECK + +; CHECK-LABEL: {{^}}_amdgpu_cs_main: +; CHECK: ; TotalNumSgprs: 6 +; CHECK: ; NumVgprs: 1 +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: true +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: true +; CHECK-NEXT: .linear_centroid_ena: true +; CHECK-NEXT: .linear_sample_ena: true +; CHECK-NEXT: .persp_center_ena: true +; CHECK-NEXT: .persp_centroid_ena: true +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: true +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_cs +; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0xa +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; 
CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x20 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .gs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_gs +; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0x500 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .hs: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_hs +; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0xa00 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .ps: +; CHECK-NEXT: .agpr_count: 0 +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_ps +; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: false +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: false +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0x6 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: .wgp_mode: false +; CHECK: .registers: {} +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT:... 
+; CHECK-NEXT: .end_amdgpu_pal_metadata + +define amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 { +.entry: + %i = call i64 @llvm.amdgcn.s.getpc() + %i1 = and i64 %i, -4294967296 + %i2 = zext i32 %arg1 to i64 + %i3 = or i64 %i1, %i2 + %i4 = inttoptr i64 %i3 to ptr addrspace(4) + %i5 = and i32 %arg2, 1023 + %i6 = lshr i32 %arg2, 10 + %i7 = and i32 %i6, 1023 + %i8 = add nuw nsw i32 %i7, %i5 + %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16 + %.idx = shl nuw nsw i32 %i8, 2 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @ps_shader() #1 { + ret void +} + +@LDS.GS = external addrspace(3) global [1 x i32], align 4 + +define amdgpu_gs void @gs_shader() { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +@LDS.HS = external addrspace(3) global [1024 x i32], align 4 + +define amdgpu_hs void @hs_shader() { + %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0 + store i32 0, ptr addrspace(3) %ptr, align 4 + ret void +} + +!amdgpu.pal.metadata.msgpack = !{!0} + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.amdgcn.s.getpc() #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write) +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3 + +attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" } + +attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" } + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"} +!1 = !{i32 7} diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll new file mode 100644 index 0000000000000..f47c2f2a85156 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | FileCheck %s +; RUN: %if ptxas-sm_80 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx81 | %ptxas-verify -arch=sm_80 %} + +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 +define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rna_satfinite_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rna_satfinite_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rna.satfinite.tf32.f32 %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) + ret i32 %val +} diff --git a/llvm/test/CodeGen/NVPTX/convert-sm89.ll b/llvm/test/CodeGen/NVPTX/convert-sm89.ll index 616dcfa330e81..170c120162cc3 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm89.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm89.ll @@ -84,10 +84,3 @@ define <2 x half> @cvt_rn_relu_f16x2_e5m2x2(i16 %in) { %val = call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 %in); ret <2 x half> %val } - -; CHECK-LABEL: cvt_rna_satfinite_tf32_f32 -define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { -; CHECK: cvt.rna.satfinite.tf32.f32 - %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) - ret i32 %val -} diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo.tbd b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo_arm64.tbd similarity index 100% rename from llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo.tbd rename to llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo_arm64.tbd diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo_arm64e.tbd b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo_arm64e.tbd new file mode 100644 index 0000000000000..7b21ab0cff165 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/Inputs/MachO_Foo_arm64e.tbd @@ -0,0 +1,23 @@ +--- !tapi-tbd +tbd-version: 4 +targets: [ arm64e-macos ] +uuids: + - target: arm64e-macos + value: 00000000-0000-0000-0000-000000000000 +flags: [ installapi ] +install-name: Foo.framework/Foo +current-version: 1.2.3 +compatibility-version: 1.2 +swift-abi-version: 5 +parent-umbrella: + - targets: [ arm64e-macos ] + umbrella: System +exports: + - targets: [ arm64e-macos ] + symbols: [ _foo ] + objc-classes: [] + objc-eh-types: [] + objc-ivars: [] + weak-symbols: [] + thread-local-symbols: [] +... diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_weak_link.test b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_weak_link.test index 4326a604297b6..37847918bfe88 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_weak_link.test +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_weak_link.test @@ -1,8 +1,14 @@ # RUN: rm -rf %t && mkdir -p %t # RUN: llvm-mc -triple=arm64-apple-darwin19 -filetype=obj -o %t/main.o \ # RUN: %S/Inputs/MachO_main_ret_foo.s -# RUN: llvm-jitlink -noexec %t/main.o -weak_library %S/Inputs/MachO_Foo.tbd - +# RUN: llvm-jitlink -noexec %t/main.o -weak_library \ +# RUN: %S/Inputs/MachO_Foo_arm64.tbd +# RUN: llvm-jitlink -noexec %t/main.o -weak_library \ +# RUN: %S/Inputs/MachO_Foo_arm64e.tbd +# # Check that we can load main.o, which unconditionally uses symbol foo, by # using -weak_library on a TBD file to emulate forced weak linking against # a library that supplies foo, but is missing at runtime. 
+# +# Check that weak linking works for arm64 JIT'd programs even if the TBD +# file contains only an arm64e interface. diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index 80a340c1f6261..566e8554765ca 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -52,7 +52,7 @@ // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000 // max_lds_size -// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000 // OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000 @@ -231,13 +231,13 @@ max_vgprs: .p2align 6 .amdhsa_kernel max_lds_size - .amdhsa_group_segment_fixed_size 393216 + .amdhsa_group_segment_fixed_size 327680 .amdhsa_next_free_vgpr 1 .amdhsa_next_free_sgpr 1 .end_amdhsa_kernel // ASM: .amdhsa_kernel max_lds_size -// ASM: .amdhsa_group_segment_fixed_size 393216 +// ASM: .amdhsa_group_segment_fixed_size 327680 // ASM: .end_amdhsa_kernel // Test maximum VGPR allocation diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s index 642e62df0437a..0d6bc61ac7753 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s @@ -52,7 +52,7 @@ // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000 // max_lds_size -// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000 // OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000 @@ -231,13 +231,13 @@ max_vgprs: .p2align 6 .amdhsa_kernel max_lds_size - .amdhsa_group_segment_fixed_size 393216 + .amdhsa_group_segment_fixed_size 327680 .amdhsa_next_free_vgpr 1 .amdhsa_next_free_sgpr 1 .end_amdhsa_kernel // ASM: .amdhsa_kernel max_lds_size -// ASM: .amdhsa_group_segment_fixed_size 393216 +// ASM: .amdhsa_group_segment_fixed_size 327680 // ASM: .end_amdhsa_kernel // Test maximum VGPR allocation diff --git a/llvm/test/Transforms/InstCombine/icmp-shr.ll b/llvm/test/Transforms/InstCombine/icmp-shr.ll index 8aceba04e0aeb..532e8b014b328 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shr.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shr.ll @@ -579,7 +579,7 @@ define i1 @ashr_ugt_0(i4 %x) { define i1 @ashr_ugt_0_multiuse(i4 %x, ptr %p) { ; CHECK-LABEL: @ashr_ugt_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = ashr i4 [[X:%.*]], 1 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i4 [[X]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp ne i4 [[S]], 0 ; CHECK-NEXT: store i4 [[S]], ptr [[P:%.*]], align 1 ; CHECK-NEXT: ret i1 [[R]] ; @@ -934,7 +934,7 @@ define i1 @lshr_eq_0_multiuse(i8 %x) { ; CHECK-LABEL: @lshr_eq_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = lshr i8 [[X:%.*]], 2 ; CHECK-NEXT: call void @use(i8 [[S]]) -; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X]], 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[S]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i8 %x, 2 @@ -947,7 +947,7 @@ define i1 @lshr_ne_0_multiuse(i8 %x) { ; CHECK-LABEL: @lshr_ne_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = lshr i8 [[X:%.*]], 2 ; CHECK-NEXT: call void @use(i8 [[S]]) -; CHECK-NEXT: [[C:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[S]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i8 %x, 2 @@ -960,7 +960,7 @@ 
define i1 @ashr_eq_0_multiuse(i8 %x) { ; CHECK-LABEL: @ashr_eq_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = ashr i8 [[X:%.*]], 2 ; CHECK-NEXT: call void @use(i8 [[S]]) -; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X]], 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[S]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i8 %x, 2 @@ -973,7 +973,7 @@ define i1 @ashr_ne_0_multiuse(i8 %x) { ; CHECK-LABEL: @ashr_ne_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = ashr i8 [[X:%.*]], 2 ; CHECK-NEXT: call void @use(i8 [[S]]) -; CHECK-NEXT: [[C:%.*]] = icmp ugt i8 [[X]], 3 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[S]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i8 %x, 2 @@ -982,6 +982,46 @@ define i1 @ashr_ne_0_multiuse(i8 %x) { ret i1 %c } +define i1 @lshr_eq_0(i8 %x) { +; CHECK-LABEL: @lshr_eq_0( +; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X:%.*]], 4 +; CHECK-NEXT: ret i1 [[C]] +; + %s = lshr i8 %x, 2 + %c = icmp eq i8 %s, 0 + ret i1 %c +} + +define i1 @lshr_ne_0(i8 %x) { +; CHECK-LABEL: @lshr_ne_0( +; CHECK-NEXT: [[C:%.*]] = icmp ugt i8 [[X:%.*]], 3 +; CHECK-NEXT: ret i1 [[C]] +; + %s = lshr i8 %x, 2 + %c = icmp ne i8 %s, 0 + ret i1 %c +} + +define i1 @ashr_eq_0(i8 %x) { +; CHECK-LABEL: @ashr_eq_0( +; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[X:%.*]], 4 +; CHECK-NEXT: ret i1 [[C]] +; + %s = ashr i8 %x, 2 + %c = icmp eq i8 %s, 0 + ret i1 %c +} + +define i1 @ashr_ne_0(i8 %x) { +; CHECK-LABEL: @ashr_ne_0( +; CHECK-NEXT: [[C:%.*]] = icmp ugt i8 [[X:%.*]], 3 +; CHECK-NEXT: ret i1 [[C]] +; + %s = ashr i8 %x, 2 + %c = icmp ne i8 %s, 0 + ret i1 %c +} + define i1 @lshr_exact_eq_0_multiuse(i8 %x) { ; CHECK-LABEL: @lshr_exact_eq_0_multiuse( ; CHECK-NEXT: [[S:%.*]] = lshr exact i8 [[X:%.*]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 76fa6bdb543a7..f37f62ffa8aa0 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -914,7 +914,7 @@ define float @scalarize_induction_variable_02(ptr %a, ptr %b, i64 %n) { ; IND-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; IND-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; IND-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 9 +; IND-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i64 [[TMP1]], 0 ; IND-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 4611686018427387902 diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index a598f154ef54b..010890e30f4eb 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -65,7 +65,7 @@ define void @scalar_store(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 @@ -125,7 +125,7 @@ define void @expansion(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 
[[N]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll index 4274719f2efd3..9ff9f92c4edca 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll @@ -9,13 +9,11 @@ target triple = "thumbv6m-none-none-eabi" define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef %pResult) #0 { ; CHECK-LABEL: @arm_mean_q7( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_NOT10:%.*]] = icmp ult i32 [[BLOCKSIZE:%.*]], 16 -; CHECK-NEXT: br i1 [[CMP_NOT10]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] -; CHECK: while.body.preheader: -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[BLOCKSIZE]], 4 -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[BLOCKSIZE:%.*]], 4 +; CHECK-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[SHR]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT10]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[SUM_013:%.*]] = phi i32 [ [[TMP2:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[SUM_013:%.*]] = phi i32 [ [[TMP2:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER:%.*]] ] ; CHECK-NEXT: [[PSRC_ADDR_012:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRC:%.*]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[BLKCNT_011:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[PSRC_ADDR_012]], align 1 @@ -30,8 +28,8 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PSRC]], i32 [[TMP3]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: -; CHECK-NEXT: [[PSRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PSRC]], [[ENTRY:%.*]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP2]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRC_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PSRC]], [[WHILE_BODY_PREHEADER]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP2]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[BLOCKSIZE]], 15 ; CHECK-NEXT: [[CMP2_NOT15:%.*]] = icmp eq i32 [[AND]], 0 ; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll index eb5e279947ecb..3585fe9f757d3 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll @@ -32,7 +32,7 @@ define i32 @ctlz_loop_with_abs(i32 %n) { ; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP1]] = lshr i32 [[N_ADDR_03]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp samesign ult i32 [[N_ADDR_03]], 2 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC]], 
[[WHILE_BODY]] ] diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 50b4ac372b4e4..217e521b2e43e 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -27,7 +27,6 @@ #include "llvm/ExecutionEngine/Orc/EPCDebugObjectRegistrar.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" -#include "llvm/ExecutionEngine/Orc/GetDylibInterface.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h" #include "llvm/ExecutionEngine/Orc/JITLinkReentryTrampolines.h" diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index 84384217897c4..0034cd9993b88 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -35,7 +35,6 @@ static_library("Orc") { "ExecutionUtils.cpp", "ExecutorProcessControl.cpp", "ExecutorResolutionGenerator.cpp", - "GetDylibInterface.cpp", "IRCompileLayer.cpp", "IRPartitionLayer.cpp", "IRTransformLayer.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index e47ca1e07670a..e5599cb80c0ea 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -69,6 +69,12 @@ tablegen("AMDGPUGenRegisterBank") { td_file = "AMDGPU.td" } +tablegen("AMDGPUGenSDNodeInfo") { + visibility = [ ":LLVMAMDGPUCodeGen" ] + args = [ "-gen-sd-node-info" ] + td_file = "AMDGPU.td" +} + tablegen("InstCombineTables") { visibility = [ ":LLVMAMDGPUCodeGen" ] args = [ "-gen-searchable-tables" ] @@ -103,6 +109,7 @@ static_library("LLVMAMDGPUCodeGen") { ":AMDGPUGenPreLegalizeGICombiner", ":AMDGPUGenRegBankGICombiner", ":AMDGPUGenRegisterBank", + ":AMDGPUGenSDNodeInfo", ":InstCombineTables", ":R600GenCallingConv", ":R600GenDAGISel", @@ -161,6 +168,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULibCalls.cpp", "AMDGPULibFunc.cpp", "AMDGPULowerBufferFatPointers.cpp", + "AMDGPULowerExecSync.cpp", "AMDGPULowerIntrinsics.cpp", "AMDGPULowerKernelArguments.cpp", "AMDGPULowerKernelAttributes.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 2bce96859f8bc..c368cfe46405e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -30,6 +30,12 @@ tablegen("PPCGenRegisterBank") { td_file = "PPC.td" } +tablegen("PPCGenSDNodeInfo") { + visibility = [ ":LLVMPowerPCCodeGen" ] + args = [ "-gen-sd-node-info" ] + td_file = "PPC.td" +} + static_library("LLVMPowerPCCodeGen") { deps = [ ":PPCGenCallingConv", @@ -37,6 +43,7 @@ static_library("LLVMPowerPCCodeGen") { ":PPCGenFastISel", ":PPCGenGlobalISel", ":PPCGenRegisterBank", + ":PPCGenSDNodeInfo", "AsmParser:PPCGenAsmMatcher", "MCTargetDesc", "TargetInfo", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/BUILD.gn index a5718e0c48feb..f360b01274705 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/BUILD.gn @@ -12,10 +12,17 @@ tablegen("SystemZGenDAGISel") { td_file = "SystemZ.td" } +tablegen("SystemZGenSDNodeInfo") { + 
visibility = [ ":LLVMSystemZCodeGen" ] + args = [ "-gen-sd-node-info" ] + td_file = "SystemZ.td" +} + static_library("LLVMSystemZCodeGen") { deps = [ ":SystemZGenCallingConv", ":SystemZGenDAGISel", + ":SystemZGenSDNodeInfo", "MCTargetDesc", "TargetInfo", "//llvm/include/llvm/Config:llvm-config", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn index f22ee4f31741b..2a1348ffb5295 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -90,7 +90,6 @@ static_library("LLVMX86CodeGen") { "X86CmovConversion.cpp", "X86CodeGenPassBuilder.cpp", "X86CompressEVEX.cpp", - "X86DiscriminateMemOps.cpp", "X86DomainReassignment.cpp", "X86DynAllocaExpander.cpp", "X86ExpandPseudo.cpp", @@ -110,7 +109,6 @@ static_library("LLVMX86CodeGen") { "X86ISelLoweringCall.cpp", "X86IndirectBranchTracking.cpp", "X86IndirectThunks.cpp", - "X86InsertPrefetch.cpp", "X86InsertWait.cpp", "X86InstCombineIntrinsic.cpp", "X86InstrFMA3Info.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index e40a8ee04dd38..61dedc1ab0f69 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -32,7 +32,6 @@ unittest("CodeGenTests") { "LexicalScopesTest.cpp", "LowLevelTypeTest.cpp", "MIR2VecTest.cpp", - "MLRegAllocDevelopmentFeatures.cpp", "MachineBasicBlockTest.cpp", "MachineDomTreeUpdaterTest.cpp", "MachineInstrBundleIteratorTest.cpp", diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 79bc380dbcb7a..0164a2fb9fa81 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -628,6 +628,8 @@ def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> { /*default=*/"false", "Replace memref arguments in GPU functions with bare pointers. " "All memrefs must have static shape.">, + Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true", + "Experimental performance flag to disallow pattern rollback">, ListOption<"allowedDialects", "allowed-dialects", "std::string", "Run conversion patterns of only the specified dialects">, ]; diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index fccb49d49da70..34c85de3418ec 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -58,6 +58,10 @@ struct GPUToNVVMPipelineOptions "Whether to use the bareptr calling convention on the host (warning " "this should be false until the GPU layering is fixed)"), llvm::cl::init(false)}; + PassOptions::Option allowPatternRollback{ + *this, "allow-pattern-rollback", + llvm::cl::desc("Allow pattern rollback during dialect conversion"), + llvm::cl::init(true)}; }; // Options for the gpu to xevm pipeline. 
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index d64c4d64cad84..5848489274c13 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -419,7 +419,10 @@ struct LowerGpuOpsToNVVMOpsPass final if (this->hasRedux) populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns); configureGpuToNVVMConversionLegality(target); - if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) + ConversionConfig config; + config.allowPatternRollback = allowPatternRollback; + if (failed( + applyPartialConversion(m, target, std::move(llvmPatterns), config))) signalPassFailure(); } }; diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 2c3e4661d266a..5462cddd44718 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -72,6 +72,7 @@ void buildGpuPassPipeline(OpPassManager &pm, ConvertGpuOpsToNVVMOpsOptions opt; opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv; opt.indexBitwidth = options.indexBitWidth; + opt.allowPatternRollback = options.allowPatternRollback; pm.addNestedPass(createConvertGpuOpsToNVVMOps(opt)); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createCSEPass()); diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index a4b5dde8a2187..f1cc1eb983267 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allow-pattern-rollback=0' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allowed-dialects=func,arith,cf' -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-bare-ptr-memref-call-conv=1' -split-input-file | FileCheck %s --check-prefix=CHECK-BARE // RUN: mlir-opt %s -transform-interpreter | FileCheck %s diff --git a/mlir/test/Conversion/GPUToNVVM/memref.mlir b/mlir/test/Conversion/GPUToNVVM/memref.mlir index e164ca9103dee..a4e8ead344114 100644 --- a/mlir/test/Conversion/GPUToNVVM/memref.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memref.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm="allow-pattern-rollback=0" | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' \ // RUN: | FileCheck %s --check-prefix=BARE diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir index b479467efc208..82c02c1d6ee63 100644 --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s +// RUN: mlir-opt --convert-gpu-to-nvvm="allow-pattern-rollback=0" --split-input-file %s | FileCheck %s // RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir index 5585d98c25b82..d0001f6ffc376 100644 --- 
a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir index cd90ce3ba2f1a..fcff5f40a6cc7 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir index fec2567f47f15..4718ac94fa0f2 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir index d5633b00313b3..5e3a7e7e7d729 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir index db297b0fc27b7..f1a48ae0c19c5 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir index 65cbc79752177..f0a46cea7ceb9 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git 
a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir index a0c955e4b570c..ddbabd4ddf960 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir index f041df82b4325..5c56e2ddfbd51 100644 --- a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir +++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir index 71a21cf4bd620..83cf70cd17078 100644 --- a/mlir/test/Integration/GPU/CUDA/assert.mlir +++ b/mlir/test/Integration/GPU/CUDA/assert.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir index 34dde6e03c80e..77a4fa089b62d 100644 --- a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir +++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8' allow-pattern-rollback=0" -debug-only=serialize-to-binary \ // RUN: 2>&1 | FileCheck %s func.func @host_function(%arg0 : f32, %arg1 : memref) { diff --git a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir index ed01416d9523a..51f6e36aaa977 100644 --- a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir +++ b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir @@ -2,7 +2,7 @@ // increment a global atomic counter and wait for the counter to reach 2. 
// // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | env CUDA_MODULE_LOADING=EAGER mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir index 27ec1ec435fef..efffcaaf23b2e 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=serialize-to-isa \ // RUN: 2>&1 | FileCheck %s // CHECK-LABEL: Generated by LLVM NVPTX Back-End diff --git a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir index d32f5efc29d58..f810678569615 100644 --- a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir +++ b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=dump-sass \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=dump-sass \ // RUN: 2>&1 | FileCheck %s // CHECK: MOV diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index 07f3218ae89b2..fe3c2b1d93a1b 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir index b2ac90acde94f..f8f1aa8aaa42e 100644 --- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir +++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir index fd664f2331488..ef116760b69e5 100644 --- a/mlir/test/Integration/GPU/CUDA/printf.mlir +++ b/mlir/test/Integration/GPU/CUDA/printf.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir index a6207d64c038b..a4be5223cd792 100644 --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s 
\ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir index c3cee2fda46f3..3490003d6ba19 100644 --- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir +++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \ // RUN: | mlir-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 85c64ffd58ca6..635f77215b38f 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2934,6 +2934,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-exegesis"], "lib/Target/PowerPC/PPCGenExegesis.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/PowerPC/PPCGenSDNodeInfo.inc", + ), ], }, {